The following libraries are required for this analysis and are imported below.
# Load the libraries used throughout this analysis:
# ggplot2 (plots), caret (ML utilities), dplyr (data manipulation),
# vip (variable importance), ranger (random forests), rpart / rpart.plot
# (decision trees), pROC (ROC curves), xgboost / Matrix / DiagrammeR
# (gradient boosting and its tree diagrams), e1071 (SVM and related tools).
library(ggplot2)
library(caret)
## Loading required package: lattice
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
library(ranger)
library(rpart)
library(rpart.plot)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(Matrix)
library(DiagrammeR)
library(e1071)
# NOTE(review): warn = -1 suppresses ALL warnings for the whole session,
# which can hide genuine problems; prefer suppressWarnings() around the
# specific calls that emit known-benign warnings.
options(warn = -1)
#Reading the csv file
# NOTE(review): absolute, user-specific path — consider a relative path
# (or here::here()) so the script runs on other machines.
insurance.data.dup <- read.csv("~/Documents/GitHub/GitHub/Insurance-Claim-Prediction/data/insurance.csv")
# Keep the raw import in insurance.data.dup; all cleaning below is done
# on the working copy insurance.data.
insurance.data <- insurance.data.dup
# Structure check: 1338 rows, 7 predictors plus the binary response
# insuranceclaim. Categorical fields are still integer-coded here; they
# are converted to factors later.
str(insurance.data)
## 'data.frame': 1338 obs. of 8 variables:
## $ age : int 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : int 0 1 1 1 1 0 0 0 1 0 ...
## $ bmi : num 27.9 33.8 33 22.7 28.9 ...
## $ children : int 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : int 1 0 0 0 0 0 0 0 0 0 ...
## $ region : int 3 2 2 1 1 2 2 1 0 1 ...
## $ charges : num 16885 1726 4449 21984 3867 ...
## $ insuranceclaim: int 1 1 0 0 1 0 1 0 0 0 ...
nrow(insurance.data)
## [1] 1338
There are a total of 1338 observations, with explanatory variables age, sex, bmi, children, smoker, region, and charges, and the response variable insuranceclaim.
#Data Preprocessing
Following code removes the duplicates from the data if there are any.
# Drop exact duplicate rows; the count below shows one duplicate was
# removed (1338 -> 1337).
insurance.data <- unique(insurance.data)
nrow(insurance.data)
## [1] 1337
Count of the data is 1337 after applying the unique function on the data.
# Frequency tables for each categorical predictor; the print() calls
# label each table in the rendered output.
print("sex")
## [1] "sex"
# sex: presumably 0 = female, 1 = male (counts match the narrative below).
table(insurance.data$sex)
##
## 0 1
## 662 675
print("children")
## [1] "children"
# children: number of dependents, 0 through 5.
table(insurance.data$children)
##
## 0 1 2 3 4 5
## 573 324 240 157 25 18
print("smoker")
## [1] "smoker"
# smoker: 0 = non-smoker, 1 = smoker.
table(insurance.data$smoker)
##
## 0 1
## 1063 274
print("region")
## [1] "region"
# region: coded 0-3; the narrative below maps these to
# northeast/northwest/southeast/southwest.
table(insurance.data$region)
##
## 0 1 2 3
## 324 324 364 325
The predictor variables sex, children, smoker, and region contain 2, 6, 2, and 4 levels respectively. There are 662 females and 675 males. There are 1063 non-smokers compared to 274 smokers. There are four regions, mapped as northeast = 0, northwest = 1, southeast = 2, southwest = 3, with counts of 324, 324, 364, and 325 respectively. The children feature indicates the number of children or dependents: 573 records have no dependents, 324 have 1 dependent, 240 have 2, 157 have 3, 25 have 4, and 18 have 5.
# Class balance of the response: 555 no-claim (0) vs 782 claim (1),
# i.e. roughly 41.5% / 58.5% — imbalanced but not severely so.
table(insurance.data$insuranceclaim)
##
## 0 1
## 555 782
The response variable contains two levels, 0 and 1, where 0 indicates no claim and 1 indicates a claim; their counts are 555 and 782 respectively.
The following displays the unique values of each variable.
# List the distinct values of every column. lapply() is used rather than
# sapply(): the per-column results differ in length, so sapply() could not
# simplify and would return a list anyway — lapply() makes the list return
# type explicit and stable (output is identical).
lapply(insurance.data, unique)
## $age
## [1] 19 18 28 33 32 31 46 37 60 25 62 23 56 27 52 30 34 59 63 55 22 26 35 24 41
## [26] 38 36 21 48 40 58 53 43 64 20 61 44 57 29 45 54 49 47 51 42 50 39
##
## $sex
## [1] 0 1
##
## $bmi
## [1] 27.900 33.770 33.000 22.705 28.880 25.740 33.440 27.740 29.830 25.840
## [11] 26.220 26.290 34.400 39.820 42.130 24.600 30.780 23.845 40.300 35.300
## [21] 36.005 32.400 34.100 31.920 28.025 27.720 23.085 32.775 17.385 36.300
## [31] 35.600 26.315 28.600 28.310 36.400 20.425 32.965 20.800 36.670 39.900
## [41] 26.600 36.630 21.780 30.800 37.050 37.300 38.665 34.770 24.530 35.200
## [51] 35.625 33.630 28.000 34.430 28.690 36.955 31.825 31.680 22.880 37.335
## [61] 27.360 33.660 24.700 25.935 22.420 28.900 39.100 36.190 23.980 24.750
## [71] 28.500 28.100 32.010 27.400 34.010 29.590 35.530 39.805 26.885 38.285
## [81] 37.620 41.230 34.800 22.895 31.160 27.200 26.980 39.490 24.795 31.300
## [91] 38.280 19.950 19.300 31.600 25.460 30.115 29.920 27.500 28.400 30.875
## [101] 27.940 35.090 29.700 35.720 32.205 28.595 49.060 27.170 23.370 37.100
## [111] 23.750 28.975 31.350 33.915 28.785 28.300 37.400 17.765 34.700 26.505
## [121] 22.040 35.900 25.555 28.050 25.175 31.900 36.000 32.490 25.300 29.735
## [131] 38.830 30.495 37.730 37.430 24.130 37.145 39.520 24.420 27.830 36.850
## [141] 39.600 29.800 29.640 28.215 37.000 33.155 18.905 41.470 30.300 15.960
## [151] 33.345 37.700 27.835 29.200 26.410 30.690 41.895 30.900 32.200 32.110
## [161] 31.570 26.200 30.590 32.800 18.050 39.330 32.230 24.035 36.080 22.300
## [171] 26.400 31.800 26.730 23.100 23.210 33.700 33.250 24.640 33.880 38.060
## [181] 41.910 31.635 36.195 17.800 24.510 22.220 38.390 29.070 22.135 26.800
## [191] 30.020 35.860 20.900 17.290 34.210 25.365 40.150 24.415 25.200 26.840
## [201] 24.320 42.350 19.800 32.395 30.200 29.370 34.200 27.455 27.550 20.615
## [211] 24.300 31.790 21.560 28.120 40.565 27.645 31.200 26.620 48.070 36.765
## [221] 33.400 45.540 28.820 22.990 27.700 25.410 34.390 22.610 37.510 38.000
## [231] 33.330 34.865 33.060 35.970 31.400 25.270 40.945 34.105 36.480 33.800
## [241] 36.700 36.385 34.500 32.300 27.600 29.260 35.750 23.180 25.600 35.245
## [251] 43.890 20.790 30.500 21.700 21.890 24.985 32.015 30.400 21.090 22.230
## [261] 32.900 24.890 31.460 17.955 30.685 43.340 39.050 30.210 31.445 19.855
## [271] 31.020 38.170 20.600 47.520 20.400 38.380 24.310 23.600 21.120 30.030
## [281] 17.480 20.235 17.195 23.900 35.150 35.640 22.600 39.160 27.265 29.165
## [291] 16.815 33.100 26.900 33.110 31.730 46.750 29.450 32.680 33.500 43.010
## [301] 36.520 26.695 25.650 29.600 38.600 23.400 46.530 30.140 30.000 38.095
## [311] 28.380 28.700 33.820 24.090 32.670 25.100 32.560 41.325 39.500 34.300
## [321] 31.065 21.470 25.080 43.400 25.700 27.930 39.200 26.030 30.250 28.930
## [331] 35.700 35.310 31.000 44.220 26.070 25.800 39.425 40.480 38.900 47.410
## [341] 35.435 46.700 46.200 21.400 23.800 44.770 32.120 29.100 37.290 43.120
## [351] 36.860 34.295 23.465 45.430 23.650 20.700 28.270 35.910 29.000 19.570
## [361] 31.130 21.850 40.260 33.725 29.480 32.600 37.525 23.655 37.800 19.000
## [371] 21.300 33.535 42.460 38.950 36.100 29.300 39.700 38.190 42.400 34.960
## [381] 42.680 31.540 29.810 21.375 40.810 17.400 20.300 18.500 26.125 41.690
## [391] 24.100 36.200 40.185 39.270 34.870 44.745 29.545 23.540 40.470 40.660
## [401] 36.600 35.400 27.075 28.405 21.755 40.280 30.100 32.100 23.700 35.500
## [411] 29.150 27.000 37.905 22.770 22.800 34.580 27.100 19.475 26.700 34.320
## [421] 24.400 41.140 22.515 41.800 26.180 42.240 26.510 35.815 41.420 36.575
## [431] 42.940 21.010 24.225 17.670 31.500 31.100 32.780 32.450 50.380 47.600
## [441] 25.400 29.900 43.700 24.860 28.800 29.500 29.040 38.940 44.000 20.045
## [451] 40.920 35.100 29.355 32.585 32.340 39.800 24.605 33.990 28.200 25.000
## [461] 33.200 23.200 20.100 32.500 37.180 46.090 39.930 35.800 31.255 18.335
## [471] 42.900 26.790 39.615 25.900 25.745 28.160 23.560 40.500 35.420 39.995
## [481] 34.675 20.520 23.275 36.290 32.700 19.190 20.130 23.320 45.320 34.600
## [491] 18.715 21.565 23.000 37.070 52.580 42.655 21.660 32.000 18.300 47.740
## [501] 22.100 19.095 31.240 29.925 20.350 25.850 42.750 18.600 23.870 45.900
## [511] 21.500 30.305 44.880 41.100 40.370 28.490 33.550 40.375 27.280 17.860
## [521] 33.300 39.140 21.945 24.970 23.940 34.485 21.800 23.300 36.960 21.280
## [531] 29.400 27.300 37.900 37.715 23.760 25.520 27.610 27.060 39.400 34.900
## [541] 22.000 30.360 27.800 53.130 39.710 32.870 44.700 30.970
##
## $children
## [1] 0 1 3 2 5 4
##
## $smoker
## [1] 1 0
##
## $region
## [1] 3 2 1 0
##
## $charges
## [1] 16884.924 1725.552 4449.462 21984.471 3866.855 3756.622 8240.590
## [8] 7281.506 6406.411 28923.137 2721.321 27808.725 1826.843 11090.718
## [15] 39611.758 1837.237 10797.336 2395.172 10602.385 36837.467 13228.847
## [22] 4149.736 1137.011 37701.877 6203.902 14001.134 14451.835 12268.632
## [29] 2775.192 38711.000 35585.576 2198.190 4687.797 13770.098 51194.559
## [36] 1625.434 15612.193 2302.300 39774.276 48173.361 3046.062 4949.759
## [43] 6272.477 6313.759 6079.672 20630.284 3393.356 3556.922 12629.897
## [50] 38709.176 2211.131 3579.829 23568.272 37742.576 8059.679 47496.494
## [57] 13607.369 34303.167 23244.790 5989.524 8606.217 4504.662 30166.618
## [64] 4133.642 14711.744 1743.214 14235.072 6389.378 5920.104 17663.144
## [71] 16577.780 6799.458 11741.726 11946.626 7726.854 11356.661 3947.413
## [78] 1532.470 2755.021 6571.024 4441.213 7935.291 37165.164 11033.662
## [85] 39836.519 21098.554 43578.939 11073.176 8026.667 11082.577 2026.974
## [92] 10942.132 30184.937 5729.005 47291.055 3766.884 12105.320 10226.284
## [99] 22412.648 15820.699 6186.127 3645.089 21344.847 30942.192 5003.853
## [106] 17560.380 2331.519 3877.304 2867.120 47055.532 10825.254 11881.358
## [113] 4646.759 2404.734 11488.317 30259.996 11381.325 19107.780 8601.329
## [120] 6686.431 7740.337 1705.624 2257.475 39556.495 10115.009 3385.399
## [127] 17081.080 9634.538 32734.186 6082.405 12815.445 13616.359 11163.568
## [134] 1632.564 2457.211 2155.682 1261.442 2045.685 27322.734 2166.732
## [141] 27375.905 3490.549 18972.495 18157.876 20745.989 5138.257 40720.551
## [148] 9877.608 10959.695 1842.519 5125.216 7789.635 6334.344 19964.746
## [155] 7077.189 6948.701 21223.676 15518.180 36950.257 19749.383 21348.706
## [162] 36149.484 10450.552 5152.134 5028.147 10407.086 4830.630 6128.797
## [169] 2719.280 4827.905 13405.390 8116.680 1694.796 5246.047 2855.438
## [176] 48824.450 6455.863 10436.096 8823.279 8538.288 11735.879 1631.821
## [183] 4005.423 7419.478 7731.427 43753.337 3981.977 5325.651 6775.961
## [190] 4922.916 12557.605 4883.866 2137.654 12044.342 1137.470 1639.563
## [197] 5649.715 8516.829 9644.253 14901.517 2130.676 8871.152 13012.209
## [204] 37133.898 7147.105 4337.735 11743.299 20984.094 13880.949 6610.110
## [211] 1980.070 8162.716 3537.703 5002.783 8520.026 7371.772 10355.641
## [218] 2483.736 3392.977 25081.768 5012.471 10564.885 5253.524 34779.615
## [225] 19515.542 11987.168 2689.495 24227.337 7358.176 9225.256 7443.643
## [232] 14001.287 1727.785 12333.828 6710.192 19444.266 1615.767 4463.205
## [239] 17352.680 7152.671 38511.628 5354.075 35160.135 7196.867 29523.166
## [246] 24476.479 12648.703 1986.933 1832.094 4040.558 12829.455 47305.305
## [253] 44260.750 4260.744 41097.162 13047.332 43921.184 5400.980 11520.100
## [260] 33750.292 11837.160 17085.268 24869.837 36219.405 20462.998 46151.124
## [267] 17179.522 14590.632 7441.053 9282.481 1719.436 42856.838 7265.703
## [274] 9617.662 2523.169 9715.841 2803.698 2150.469 12928.791 9855.131
## [281] 22331.567 48549.178 4237.127 11879.104 9625.920 7742.110 9432.925
## [288] 14256.193 47896.791 25992.821 3172.018 20277.808 42112.236 2156.752
## [295] 3906.127 1704.568 16297.846 21978.677 38746.355 9249.495 6746.743
## [302] 24873.385 12265.507 4349.462 12646.207 19442.354 20177.671 4151.029
## [309] 11944.594 7749.156 8444.474 1737.376 42124.515 8124.408 34838.873
## [316] 9722.770 8835.265 10435.065 7421.195 4667.608 4894.753 24671.663
## [323] 35491.640 11566.301 2866.091 6600.206 3561.889 42760.502 47928.030
## [330] 9144.565 48517.563 24393.622 13429.035 11658.379 19144.577 13822.803
## [337] 12142.579 13937.666 41919.097 8232.639 18955.220 13352.100 13217.094
## [344] 13981.850 10977.206 6184.299 4889.999 8334.458 5478.037 1635.734
## [351] 11830.607 8932.084 3554.203 12404.879 14133.038 24603.048 8944.115
## [358] 9620.331 1837.282 1607.510 10043.249 4751.070 13844.506 2597.779
## [365] 3180.510 9778.347 13430.265 8017.061 8116.269 3481.868 13415.038
## [372] 12029.287 7639.417 36085.219 1391.529 18033.968 21659.930 38126.247
## [379] 16455.708 27000.985 15006.579 42303.692 20781.489 5846.918 8302.536
## [386] 1261.859 11856.412 30284.643 3176.816 4618.080 10736.871 2138.071
## [393] 8964.061 9290.139 9411.005 7526.706 8522.003 16586.498 14988.432
## [400] 1631.668 9264.797 8083.920 14692.669 10269.460 3260.199 11396.900
## [407] 4185.098 8539.671 6652.529 4074.454 1621.340 19594.810 14455.644
## [414] 5080.096 2134.901 7345.727 9140.951 18608.262 14418.280 28950.469
## [421] 46889.261 46599.108 39125.332 2727.395 8968.330 9788.866 6555.070
## [428] 7323.735 3167.456 18804.752 23082.955 4906.410 5969.723 12638.195
## [435] 4243.590 13919.823 2254.797 5926.846 12592.534 2897.323 4738.268
## [442] 37079.372 1149.396 28287.898 26109.329 7345.084 12731.000 11454.022
## [449] 5910.944 4762.329 7512.267 4032.241 1969.614 1769.532 4686.389
## [456] 21797.000 11881.970 11840.775 10601.412 7682.670 10381.479 22144.032
## [463] 15230.324 11165.418 1632.036 19521.968 13224.693 12643.378 23288.928
## [470] 2201.097 2497.038 2203.472 1744.465 20878.784 25382.297 28868.664
## [477] 35147.528 2534.394 1534.304 1824.285 15555.189 9304.702 1622.188
## [484] 9880.068 9563.029 4347.023 12475.351 1253.936 48885.136 10461.979
## [491] 1748.774 24513.091 2196.473 12574.049 17942.106 1967.023 4931.647
## [498] 8027.968 8211.100 13470.860 36197.699 6837.369 22218.115 32548.340
## [505] 5974.385 6796.863 2643.269 3077.095 3044.213 11455.280 11763.001
## [512] 2498.414 9361.327 1256.299 21082.160 11362.755 27724.289 8413.463
## [519] 5240.765 3857.759 25656.575 3994.178 9866.305 5397.617 38245.593
## [526] 11482.635 24059.680 9861.025 8342.909 1708.001 48675.518 14043.477
## [533] 12925.886 19214.706 13831.115 6067.127 5972.378 8825.086 8233.097
## [540] 27346.042 6196.448 3056.388 13887.204 63770.428 10231.500 23807.241
## [547] 3268.847 11538.421 3213.622 45863.205 13390.559 3972.925 12957.118
## [554] 11187.657 17878.901 3847.674 8334.590 3935.180 39983.426 1646.430
## [561] 9193.838 10923.933 2494.022 9058.730 2801.259 2128.431 6373.557
## [568] 7256.723 11552.904 45702.022 3761.292 2219.445 4753.637 31620.001
## [575] 13224.057 12222.898 1665.000 58571.074 9724.530 3206.491 12913.992
## [582] 6356.271 17626.240 1242.816 4779.602 3861.210 43943.876 13635.638
## [589] 5976.831 11842.442 8428.069 2566.471 15359.104 5709.164 8823.986
## [596] 7640.309 5594.846 7441.501 33471.972 1633.044 9174.136 11070.535
## [603] 16085.128 17468.984 9283.562 3558.620 25678.778 4435.094 39241.442
## [610] 8547.691 6571.544 2207.697 6753.038 1880.070 42969.853 11658.115
## [617] 23306.547 34439.856 10713.644 3659.346 40182.246 9182.170 34617.841
## [624] 12129.614 3736.465 6748.591 11326.715 11365.952 42983.459 10085.846
## [631] 1977.815 3366.670 7173.360 9391.346 14410.932 2709.112 24915.046
## [638] 20149.323 12949.155 6666.243 32787.459 13143.865 4466.621 18806.145
## [645] 10141.136 6123.569 8252.284 1712.227 12430.953 9800.888 10579.711
## [652] 8280.623 8527.532 12244.531 24667.419 3410.324 4058.712 26392.260
## [659] 14394.398 6435.624 22192.437 5148.553 1136.399 27037.914 42560.430
## [666] 8703.456 40003.332 45710.208 6500.236 4837.582 3943.595 4399.731
## [673] 6185.321 46200.985 7222.786 12485.801 46130.526 12363.547 10156.783
## [680] 2585.269 1242.260 40103.890 9863.472 4766.022 11244.377 7729.646
## [687] 5438.749 26236.580 34806.468 2104.113 8068.185 2362.229 2352.968
## [694] 3577.999 3201.245 29186.482 40273.645 10976.246 3500.612 2020.552
## [701] 9541.696 9504.310 5385.338 8930.935 5375.038 44400.406 10264.442
## [708] 6113.231 5469.007 1727.540 10107.221 8310.839 1984.453 2457.502
## [715] 12146.971 9566.991 13112.605 10848.134 12231.614 9875.680 11264.541
## [722] 12979.358 1263.249 10106.134 40932.429 6664.686 16657.717 2217.601
## [729] 6781.354 19361.999 10065.413 4234.927 9447.250 14007.222 9583.893
## [736] 40419.019 3484.331 36189.102 44585.456 8604.484 18246.496 43254.418
## [743] 3757.845 8827.210 9910.360 11737.849 1627.282 8556.907 3062.508
## [750] 19539.243 1906.358 14210.536 11833.782 17128.426 5031.270 7985.815
## [757] 23065.421 5428.728 36307.798 3925.758 2416.955 19040.876 3070.809
## [764] 9095.068 11842.624 8062.764 7050.642 14319.031 6933.242 27941.288
## [771] 11150.780 12797.210 17748.506 7261.741 10560.492 6986.697 7448.404
## [778] 5934.380 9869.810 18259.216 1146.797 9386.161 24520.264 4350.514
## [785] 6414.178 12741.167 1917.318 5209.579 13457.961 5662.225 1252.407
## [792] 2731.912 21195.818 7209.492 18310.742 4266.166 4719.524 11848.141
## [799] 17904.527 7046.722 14313.846 2103.080 38792.686 1815.876 7731.858
## [806] 28476.735 2136.882 1131.507 3309.793 9414.920 6360.994 11013.712
## [813] 4428.888 5584.306 1877.929 2842.761 3597.596 23401.306 55135.402
## [820] 7445.918 2680.949 1621.883 8219.204 12523.605 16069.085 43813.866
## [827] 20773.628 39597.407 6117.494 13393.756 5266.366 4719.737 11743.934
## [834] 5377.458 7160.330 4402.233 11657.719 6402.291 12622.180 1526.312
## [841] 12323.936 36021.011 27533.913 10072.055 45008.955 9872.701 2438.055
## [848] 2974.126 10601.632 37270.151 14119.620 42111.665 11729.680 24106.913
## [855] 1875.344 40974.165 15817.986 18218.161 10965.446 46113.511 7151.092
## [862] 12269.689 5458.046 8782.469 6600.361 1141.445 11576.130 13129.603
## [869] 4391.652 8457.818 3392.365 5966.887 6849.026 8891.139 2690.114
## [876] 26140.360 6653.789 6282.235 6311.952 3443.064 2789.057 2585.851
## [883] 46255.113 4877.981 19719.695 27218.437 5272.176 1682.597 11945.133
## [890] 29330.983 7243.814 10422.917 44202.654 13555.005 13063.883 19798.055
## [897] 2221.564 1634.573 2117.339 8688.859 48673.559 4661.286 8125.784
## [904] 12644.589 4564.191 4846.920 7633.721 15170.069 17496.306 2639.043
## [911] 33732.687 14382.709 7626.993 5257.508 2473.334 21774.322 35069.375
## [918] 13041.921 5245.227 13451.122 13462.520 5488.262 4320.411 6250.435
## [925] 25333.333 2913.569 12032.326 13470.804 6289.755 2927.065 6238.298
## [932] 10096.970 7348.142 4673.392 12233.828 32108.663 8965.796 2304.002
## [939] 9487.644 1121.874 9549.565 2217.469 1628.471 12982.875 11674.130
## [946] 7160.094 39047.285 6358.776 19933.458 11534.873 47462.894 4527.183
## [953] 38998.546 20009.634 3875.734 41999.520 12609.887 41034.221 28468.919
## [960] 2730.108 3353.284 14474.675 9500.573 26467.097 4746.344 23967.383
## [967] 7518.025 3279.869 8596.828 10702.642 4992.376 2527.819 1759.338
## [974] 2322.622 16138.762 7804.160 2902.907 9704.668 4889.037 25517.114
## [981] 4500.339 19199.944 16796.412 4915.060 7624.630 8410.047 28340.189
## [988] 4518.826 14571.891 3378.910 7144.863 10118.424 5484.467 16420.495
## [995] 7986.475 7418.522 13887.969 6551.750 5267.818 17361.766 34472.841
## [1002] 1972.950 21232.182 8627.541 4433.388 4438.263 24915.221 23241.475
## [1009] 9957.722 8269.044 18767.738 36580.282 8765.249 5383.536 12124.992
## [1016] 2709.244 3987.926 12495.291 26018.951 8798.593 35595.590 42211.138
## [1023] 1711.027 8569.862 2020.177 16450.895 21595.382 9850.432 6877.980
## [1030] 21677.283 44423.803 4137.523 13747.872 12950.071 12094.478 37484.449
## [1037] 39725.518 2250.835 22493.660 20234.855 1704.700 33475.817 3161.454
## [1044] 11394.066 21880.820 7325.048 44501.398 3594.171 39727.614 8023.135
## [1051] 14394.558 9288.027 25309.489 3353.470 10594.502 8277.523 17929.303
## [1058] 2480.979 4462.722 1981.582 11554.224 48970.248 6548.195 5708.867
## [1065] 7045.499 8978.185 5757.413 14349.854 10928.849 39871.704 13974.456
## [1072] 1909.527 12096.651 13204.286 4562.842 8551.347 2102.265 34672.147
## [1079] 15161.534 11884.049 4454.403 5855.903 4076.497 15019.760 19023.260
## [1086] 10796.350 11353.228 9748.911 10577.087 41676.081 11286.539 3591.480
## [1093] 33907.548 11299.343 4561.189 44641.197 1674.632 23045.566 3227.121
## [1100] 16776.304 11253.421 3471.410 11363.283 20420.605 10338.932 8988.159
## [1107] 10493.946 2904.088 8605.362 11512.405 41949.244 24180.933 5312.170
## [1114] 2396.096 10807.486 9222.403 36124.574 38282.749 5693.431 34166.273
## [1121] 8347.164 46661.442 18903.491 40904.200 14254.608 10214.636 5836.520
## [1128] 14358.364 1728.897 8582.302 3693.428 20709.020 9991.038 19673.336
## [1135] 11085.587 7623.518 3176.288 3704.354 36898.733 9048.027 7954.517
## [1142] 27117.994 6338.076 9630.397 11289.109 52590.829 2261.569 10791.960
## [1149] 5979.731 2203.736 12235.839 40941.285 5630.458 11015.175 7228.216
## [1156] 39722.746 14426.074 2459.720 3989.841 7727.253 5124.189 18963.172
## [1163] 2200.831 7153.554 5227.989 10982.501 4529.477 4670.640 6112.353
## [1170] 17178.682 22478.600 11093.623 6457.843 4433.916 2154.361 23887.663
## [1177] 6496.886 2899.489 19350.369 7650.774 2850.684 2632.992 9447.382
## [1184] 18328.238 8603.823 37465.344 13844.797 21771.342 13126.677 5327.400
## [1191] 13725.472 13019.161 8671.191 4134.082 18838.704 33307.551 5699.837
## [1198] 6393.603 4934.705 6198.752 8733.229 2055.325 9964.060 18223.451
## [1205] 5116.500 36910.608 38415.474 20296.863 12347.172 5373.364 23563.016
## [1212] 1702.455 10806.839 3956.071 12890.058 5415.661 4058.116 41661.602
## [1219] 7537.164 4718.204 6593.508 8442.667 26125.675 6858.480 4795.657
## [1226] 6640.545 7162.012 10594.226 11938.256 60021.399 20167.336 12479.709
## [1233] 11345.519 8515.759 2699.568 14449.854 12224.351 6985.507 3238.436
## [1240] 47269.854 49577.662 4296.271 3171.615 1135.941 5615.369 9101.798
## [1247] 6059.173 1633.962 37607.528 18648.422 1241.565 16232.847 15828.822
## [1254] 4415.159 6474.013 11436.738 11305.935 30063.581 10197.772 4544.235
## [1261] 3277.161 6770.193 7337.748 10370.913 26926.514 10704.470 34254.053
## [1268] 1880.487 8615.300 3292.530 3021.809 14478.330 4747.053 17043.341
## [1275] 10959.330 2741.948 4357.044 22462.044 4189.113 8283.681 24535.699
## [1282] 14283.459 1720.354 47403.880 8534.672 3732.625 5472.449 38344.566
## [1289] 7147.473 7133.903 34828.654 1515.345 9301.894 11931.125 1964.780
## [1296] 1708.926 4340.441 5261.469 2710.829 62592.873 46718.163 3208.787
## [1303] 37829.724 21259.378 2464.619 16115.305 21472.479 33900.653 6875.961
## [1310] 6940.910 4571.413 4536.259 36397.576 18765.875 11272.331 1731.677
## [1317] 1163.463 19496.719 7201.701 5425.023 28101.333 12981.346 43896.376
## [1324] 4239.893 13143.337 7050.021 9377.905 22395.744 10325.206 12629.166
## [1331] 10795.937 11411.685 10600.548 2205.981 1629.833 2007.945 29141.360
##
## $insuranceclaim
## [1] 1 0
The following code converts the columns sex, children, smoker, insuranceclaim, and region to factors.
# Columns that are categorical but currently stored as integers.
column_names <- c(
"sex", "children", "smoker", "insuranceclaim", "region"
)
# Convert the selected columns to factors in our data frame so models
# treat them as categorical levels rather than numeric magnitudes.
insurance.data[, column_names] <- lapply(insurance.data[, column_names], as.factor)
str(insurance.data)
## 'data.frame': 1337 obs. of 8 variables:
## $ age : int 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : Factor w/ 2 levels "0","1": 1 2 2 2 2 1 1 1 2 1 ...
## $ bmi : num 27.9 33.8 33 22.7 28.9 ...
## $ children : Factor w/ 6 levels "0","1","2","3",..: 1 2 4 1 1 1 2 4 3 1 ...
## $ smoker : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 1 ...
## $ region : Factor w/ 4 levels "0","1","2","3": 4 3 3 2 2 3 3 2 1 2 ...
## $ charges : num 16885 1726 4449 21984 3867 ...
## $ insuranceclaim: Factor w/ 2 levels "0","1": 2 2 1 1 2 1 2 1 1 1 ...
Here, we check for null values in any column of the data frame, as missing values could mislead the models we fit and their predictions.
# Global missing-value check: TRUE if any cell in the data frame is NA.
any(is.na(insurance.data))
## [1] FALSE
# Per-column check: a TRUE entry would flag a column containing at least
# one NA. All FALSE here, so no imputation is needed.
colSums(is.na(insurance.data)) > 0
## age sex bmi children smoker
## FALSE FALSE FALSE FALSE FALSE
## region charges insuranceclaim
## FALSE FALSE FALSE
There are no such records in our data frame.
The following bar chart displays the distribution of claims and no-claims. No-claims account for 41.5% of the response values and claims account for 58.5%.
# Percentage of rows in each response level (no-claim vs claim).
percentage_data <- table(insurance.data$insuranceclaim) / nrow(insurance.data) * 100
# Create a data frame for plotting: one row per response level with its
# percentage share.
plot_data <- data.frame(insuranceclaim = as.factor(names(percentage_data)),
percentage = as.numeric(percentage_data))
# Bar chart of the class balance, with percentage labels centred inside
# each bar.
ggplot(plot_data, aes(x = insuranceclaim, y = percentage)) +
geom_bar(stat = "identity", fill = "skyblue", color = "black") +
geom_text(aes(label = sprintf("%.1f%%", percentage)),
position = position_stack(vjust = 0.5), # Adjust vjust for vertical position
color = "black", size = 3) +
labs(title = "Distribution of Insurance Claims: Non-Claims (0) vs. Claims (1)",
x = "Insurance Claim",
y = "Percentage")
The following code splits the data into train and test sets in the proportions of 80% and 20%.
# Binomial Logit Model - 80-20 split
# Stratified sampling on the response so train and test sets keep
# (approximately) the same claim / no-claim proportions.
set.seed(123457)
train.prop <- 0.80
strats <- insurance.data$insuranceclaim
# Row indices grouped by response level (one list element per level).
# NOTE(review): 1:length(strats) would misbehave if strats were empty;
# seq_along(strats) is the safer idiom.
rr <- split(1:length(strats), strats)
# Sample 80% within each stratum. sample()'s size argument truncates
# length(x) * train.prop to an integer, so each stratum contributes
# floor(0.8 * stratum size) rows.
idx <- sort(as.numeric(unlist(sapply(rr,
function(x) sample(x, length(x)*train.prop)))))
insurance.data.train <- insurance.data[idx, ]
insurance.data.test <- insurance.data[-idx, ]
Following are the distributions of the claims and no claims in the train and test data frames with similar distributions.
#check for equal proportions of number of claims
# Both sets should be close to the overall 41.5% / 58.5% split,
# confirming the stratified sampling worked as intended.
table(insurance.data.train$insuranceclaim)/nrow(insurance.data.train)
##
## 0 1
## 0.4153414 0.5846586
table(insurance.data.test$insuranceclaim)/nrow(insurance.data.test)
##
## 0 1
## 0.4141791 0.5858209
Since the response variable is binary, taking only the values 0 and 1, we can apply a binary logit model.
A binary random variable \(Y\) can assume only one of two possible values, a value of \(1\) (Yes) or a value of \(0\) (No).
A binary random variable \(Y\) has a Bernoulli(\(\pi\)) distribution with
\[ P(Y=1)=\pi = 1-P(Y=0), \] {#eq-bernoulli}
and probability mass function (p.m.f.)
\[ p(y; \pi) = \pi^y (1-\pi)^{1-y},~y = 0 \mbox{ or } 1; 0 \le \pi \le 1. \] {#eq-pmfBern}
A useful transformation of \(\pi\) is the logit (or, log-odds) transformation:
\[ \text{logit}(\pi) = \log \left(\frac{\pi}{1-\pi}\right) \] {#eq-logitpi}
Note: we looked at log odds in @sec-ch3TwoSamp.
Let \(\eta = \text{logit}(\pi)\). After some algebra, we see that we can uniquely write \(\pi\) as a function of \(\eta\), i.e., the inverse transformation is
\[ \pi = \frac{\exp(\eta)}{1+\exp(\eta)} \] {#eq-invlogit}
The binary logit (or, logistic regression) model is a generalized linear model (GLIM) for explaining binary responses \(Y_i\). Our goal is to model the binary responses as functions of \(p\) independent variables denoted by \(X_{i,j},~j=1,\ldots,p\) for each \(i\).
The random component of the GLIM is
\[ Y_i | \pi_i \sim \mbox{Bernoulli}(\pi_i). \] {#eq-logitglim1}
The systematic component is
\[ \eta_i = \beta_0 + \sum_{j=1}^p \beta_j X_{i,j} = \mathbf{x}_i' \boldsymbol{\beta}. \] {#eq-binarysys}
with \(\boldsymbol{\beta} = (\beta_0, \beta_1, \ldots,\beta_p)'\), and \(\mathbf{x}_i = (1, X_{i,1},\ldots, X_{i,p})'\).
The logit link function relates the \(i\)th mean response \(\pi_i\) to the systematic component \(\eta_i\):
\[ \mbox{logit}(\pi_i | \mathbf{x}_i) = \log\left(\frac{\pi_i}{1-\pi_i}\right) = \eta_i. \] {#eq-logitglim2}
Since the mean response \(\pi_i\) must lie in the interval \((0,1)\), whereas \(\eta_i\) is real-valued, we need a function such as the logit function to link the two in a correct way.
By inverting the logit link function (see @eq-invlogit), we can write the binary logit model as
\[ \pi_i = P(Y_i =1 | \mathbf{x}_i) = \frac{\exp(\beta_0 + \sum_{j=1}^p \beta_j X_{i,j})}{ 1+ \exp(\beta_0 + \sum_{j=1}^p \beta_j X_{i,j})}. \] {#eq-logitglim3}
Following are the null and alternative hypothesis.
Null Hypothesis (\(H_0\)): \[ H_0: \beta_j = 0 \]
The null hypothesis asserts that there is no association between the independent variable \(X_j\) and the log-odds of the dependent variable being in the “success” category.
Alternative Hypothesis (\(H_1\)): \[ H_1: \beta_j \neq 0 \]
The alternative hypothesis suggests that the independent variable \(X_j\) does have a significant association with the log-odds of the event.
#full binary logit model
# Fit the full binary logistic regression: insuranceclaim against every
# other column of the training data, with the canonical logit link.
# Factor predictors are expanded into dummy variables (e.g. children1-5).
full.logit <- glm(insuranceclaim ~ . ,data = insurance.data.train,
family = binomial(link = "logit"))
summary(full.logit)
##
## Call:
## glm(formula = insuranceclaim ~ ., family = binomial(link = "logit"),
## data = insurance.data.train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.733e+00 6.882e-01 -11.237 < 2e-16 ***
## age 3.137e-02 8.573e-03 3.659 0.000253 ***
## sex1 6.626e-02 1.845e-01 0.359 0.719451
## bmi 2.790e-01 2.178e-02 12.807 < 2e-16 ***
## children1 -2.183e+00 2.413e-01 -9.046 < 2e-16 ***
## children2 -3.487e+00 2.951e-01 -11.815 < 2e-16 ***
## children3 -4.880e+00 4.021e-01 -12.137 < 2e-16 ***
## children4 -5.051e+00 7.472e-01 -6.760 1.38e-11 ***
## children5 -3.880e+00 8.897e-01 -4.361 1.29e-05 ***
## smoker1 4.112e+00 4.710e-01 8.730 < 2e-16 ***
## region1 -3.719e-01 2.617e-01 -1.421 0.155270
## region2 -3.596e-01 2.668e-01 -1.348 0.177625
## region3 -3.058e-01 2.628e-01 -1.164 0.244625
## charges 1.022e-05 1.749e-05 0.584 0.559263
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1451.15 on 1068 degrees of freedom
## Residual deviance: 751.29 on 1055 degrees of freedom
## AIC: 779.29
##
## Number of Fisher Scoring iterations: 6
The intercept is -7.733. When all predictor variables are at zero (or their reference levels), the log-odds of making an insurance claim is -7.733, corresponding to a very low baseline probability of a claim.
For each one-unit increase in age, the log-odds of making an insurance claim increase by 0.0314 (p < 0.001).
The coefficient for ‘sex1’ is 0.06626 with a p-value of 0.719. It is not statistically significant, suggesting that gender may not be a significant predictor of insurance claims.
For each one-unit increase in BMI, the log-odds of making an insurance claim increase by 0.279 (p < 0.001).
The coefficients for ‘children1’ through ‘children5’ represent the effect of having 1 to 5 children compared to having no children. As the number of children increases, the log-odds of making an insurance claim decrease significantly.
Smokers (smoker1) have higher log-odds of making an insurance claim compared to non-smokers (4.112, p < 0.001).
The coefficients for ‘region1’, ‘region2’, and ‘region3’ represent the effect of regions 1, 2, and 3 compared to region 4. None of the regions are statistically significant.
The coefficient for ‘charges’ is not statistically significant (p = 0.559), suggesting that charges may not be a significant predictor.
# QQ-plot of the full model's deviance residuals; the two printed row
# labels below are the most extreme points flagged by car::qqPlot().
car::qqPlot(residuals(full.logit), main = NA, pch = 19, col = 2, cex = 0.7)
## 1227 681
## 982 554
# Shapiro-Wilk normality test on the residuals (small p-value => reject
# normality). NOTE(review): residuals of a logistic regression are not
# expected to be normal, so this diagnostic is of limited value here.
shapiro.test(residuals(full.logit))
##
## Shapiro-Wilk normality test
##
## data: residuals(full.logit)
## W = 0.97353, p-value = 4.624e-13
From the residual QQ-plot above, most data points appear approximately normal except for a few; however, according to the Shapiro-Wilk statistical test, we can confirm that the residuals are not normally distributed.
The following null binary logit model was fitted with no predictors, using only the response variable.
#null binary logit model
# Intercept-only logistic regression: serves as the baseline model against
# which the predictor-based models are compared (deviance, AIC, BIC).
null.logit <- glm(insuranceclaim ~ 1 ,data = insurance.data.train,
family = binomial(link = "logit"))
summary(null.logit)
##
## Call:
## glm(formula = insuranceclaim ~ 1, family = binomial(link = "logit"),
## data = insurance.data.train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.34193 0.06207 5.509 3.61e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1451.2 on 1068 degrees of freedom
## Residual deviance: 1451.2 on 1068 degrees of freedom
## AIC: 1453.2
##
## Number of Fisher Scoring iterations: 4
The intercept is 0.34193 with a standard error of 0.06207. In an intercept-only logistic model this is the log-odds of making a claim (insuranceclaim = 1) for the sample as a whole, and it is statistically significant (p-value < 0.001).
Converting back to a probability, exp(0.34193) / (1 + exp(0.34193)) ≈ 0.585, which matches the observed proportion of claims in the training data.
The model with only the intercept doesn’t provide much information about the relationship between predictors and the response variable. It serves as a baseline against which more complex models can be compared.
The AIC is relatively high, suggesting that models with additional predictors might provide a better fit to the data.
Since there are no predictor variables, the model is essentially stating that the log-odds of making an insurance claim when there are no predictors is 0.34193.
car::qqPlot(residuals(null.logit), main = NA, pch = 19, col = 2, cex = 0.7)
## 6 8
## 3 5
shapiro.test(residuals(null.logit))
##
## Shapiro-Wilk normality test
##
## data: residuals(null.logit)
## W = 0.62609, p-value < 2.2e-16
From the above residual plot we can observe most of the data points were close to normal except a few, however according to the Shapiro-Wilk statistical test we can confirm that the residuals are not normal, as we reject the null hypothesis. To select the variables which impact the response variable, we applied a stepwise variable-selection method on top of the full logit model, with direction set to both.
# Stepwise selection in both directions, starting from the null model.
# The scope list bounds the search between the intercept-only formula and the
# full-model formula.
both.logit <- step(null.logit, list(lower = formula(null.logit),
upper = formula(full.logit)),
direction = "both", trace = 0, data = insurance.data.train)
formula(both.logit)
## insuranceclaim ~ children + bmi + smoker + age
summary(both.logit)
##
## Call:
## glm(formula = insuranceclaim ~ children + bmi + smoker + age,
## family = binomial(link = "logit"), data = insurance.data.train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.916898 0.665702 -11.893 < 2e-16 ***
## children1 -2.184042 0.240776 -9.071 < 2e-16 ***
## children2 -3.478784 0.294150 -11.827 < 2e-16 ***
## children3 -4.868815 0.399380 -12.191 < 2e-16 ***
## children4 -5.008194 0.747916 -6.696 2.14e-11 ***
## children5 -3.886841 0.892521 -4.355 1.33e-05 ***
## bmi 0.276118 0.021134 13.065 < 2e-16 ***
## smoker1 4.281770 0.374253 11.441 < 2e-16 ***
## age 0.034454 0.007052 4.885 1.03e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1451.15 on 1068 degrees of freedom
## Residual deviance: 754.42 on 1060 degrees of freedom
## AIC: 772.42
##
## Number of Fisher Scoring iterations: 6
The model suggests that the number of children, BMI, smoking status, and age are significant predictors of insurance claims.
Smokers are associated with a significant increase in the likelihood of making an insurance claim.
Older individuals (higher age) are associated with a slight increase in the likelihood of making an insurance claim.
The model provides a significantly better fit than the null model, as evidenced by the lower residual deviance and AIC.
Interpret the coefficients cautiously. For example, the interpretation of the number of children assumes linearity, and interactions or nonlinear effects may be present.
car::qqPlot(residuals(both.logit), main = NA, pch = 19, col = 2, cex = 0.7)
## 1227 429
## 982 347
shapiro.test(residuals(both.logit))
##
## Shapiro-Wilk normality test
##
## data: residuals(both.logit)
## W = 0.9734, p-value = 4.245e-13
According to the above residual plot the data was not normal
There are outliers after fitting the model; let's refit the model after eliminating the observations whose residuals exceed three times the residual standard deviation. Following is the code implementation.
# Flag observations whose residuals exceed 3 residual standard deviations.
extpts <- which(abs(residuals(both.logit)) > 3*sd(residuals(both.logit)))
nrow(insurance.data.train)
## [1] 1069
length(extpts)
## [1] 15
# Drop the 15 flagged rows before refitting.
data.train.2 <- insurance.data.train[-extpts,]
# NOTE(review): this REASSIGNS full.logit to a fit on the reduced data
# (data.train.2). Every later use of full.logit (AIC/BIC comparisons,
# predictions, backward/forward selection) therefore refers to this refit,
# not to the original fit on insurance.data.train.
full.logit <- glm(insuranceclaim ~ . ,data = data.train.2,
family = binomial(link = "logit"))
both.logit.extpts <- step(full.logit,
direction="both",trace=0, data = data.train.2)
formula(both.logit.extpts)
## insuranceclaim ~ age + bmi + children + smoker
summary(both.logit.extpts)
##
## Call:
## glm(formula = insuranceclaim ~ age + bmi + children + smoker,
## family = binomial(link = "logit"), data = data.train.2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.666757 0.852947 -12.506 < 2e-16 ***
## age 0.038961 0.007922 4.918 8.74e-07 ***
## bmi 0.371495 0.027749 13.388 < 2e-16 ***
## children1 -2.696091 0.278386 -9.685 < 2e-16 ***
## children2 -4.272616 0.353996 -12.070 < 2e-16 ***
## children3 -7.124781 0.575888 -12.372 < 2e-16 ***
## children4 -7.670886 1.080693 -7.098 1.26e-12 ***
## children5 -4.539262 1.005026 -4.517 6.29e-06 ***
## smoker1 5.928896 0.513490 11.546 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1434.90 on 1053 degrees of freedom
## Residual deviance: 610.43 on 1045 degrees of freedom
## AIC: 628.43
##
## Number of Fisher Scoring iterations: 7
car::qqPlot(residuals(both.logit.extpts), main = NA, pch = 19, col = 2, cex = 0.7)
## 660 232
## 528 182
shapiro.test(residuals(both.logit.extpts))
##
## Shapiro-Wilk normality test
##
## data: residuals(both.logit.extpts)
## W = 0.96129, p-value = 4.199e-16
From the residual plot we see the data points are deviated and are not normal.
The Akaike Information Criterion (AIC) is an information criterion used for model selection. For a model with \(p\) estimated parameters, it is defined as
\[ \text{AIC} = -2 \ell(\hat{\boldsymbol{\beta}};\mathbf{y}) + 2p. \] {#eq-AIC}
While we wish to select a model with largest maximized log-likelihood, AIC penalizes us for using a model with an unnecessarily large \(p\), the penalty term being \(2p\).
Let’s compare the AIC values
#Akaike Information Criterion
# NOTE(review): full.logit was refit on data.train.2 (outliers removed), while
# both.logit and null.logit were fit on insurance.data.train, so these AIC
# values are computed on different data sets and the comparison is only
# indicative.
AIC(both.logit)
## [1] 772.4223
AIC(full.logit)
## [1] 634.5677
AIC(null.logit)
## [1] 1453.154
From the above values we can observe that the full.logit model has the smallest AIC compared to the others (keeping in mind that it was refit on the outlier-removed data).
Another useful information based model selection criterion is called the Bayesian Information Criterion (BIC), which uses a different penalty \(p\log(n)\): \[ \text{BIC} = -2 \ell(\hat{\boldsymbol{\beta}};\mathbf{y}) + p \log(n) \] {#eq-BIC}
Again, a model with smaller BIC is better which is full.logit model in comparison to the both logit model.
#Baysian Information Criteria
# NOTE(review): as with AIC above, full.logit is fit on data.train.2 while the
# other two models use insurance.data.train, so these BIC values are not
# strictly comparable.
BIC(both.logit)
## [1] 817.1926
BIC(full.logit)
## [1] 704.0126
BIC(null.logit)
## [1] 1458.129
Let’s predict the values of the test data set by the help of predict function by passing the model as well as the dataset along side the type as response where it will automatically takes care of the logit conversions.
Predictions with the help of the both.logit as well as the full.logit model.
# Predicted claim probabilities on the held-out test set for both models.
pred.both.test <- predict(both.logit, newdata = insurance.data.test, type="response")
pred.full.test <- predict(full.logit, newdata = insurance.data.test, type="response")
# Confusion tables at a 0.5 probability threshold (rows: predicted, cols: actual).
(table.both <- table(pred.both.test > 0.5, insurance.data.test$insuranceclaim))
##
## 0 1
## FALSE 90 18
## TRUE 21 139
(table.full <- table(pred.full.test > 0.5, insurance.data.test$insuranceclaim))
##
## 0 1
## FALSE 95 16
## TRUE 16 141
# Overall accuracy (%) = correctly classified / total.
(accuracy.both <- round((sum(diag(table.both))/sum(table.both))*100,2))
## [1] 85.45
(accuracy.full <- round((sum(diag(table.full))/sum(table.full))*100,2))
## [1] 88.06
We can observe that the accuracy on the test data of the full model is higher when compared to the both model.
ROC curve. Another useful metric is area under the receiver operating characteristics (ROC) curve, which used to evaluate the prediction accuracy in binary and multi-class classification.
It quantifies the trade-off between the sensitivity (true positive rate, TPR) and the false positive rate (FPR), where FPR = 1 − specificity.
Sensitivity, or the true positive rate, is the probability that a binary response is predicted as a 1 (yes), given that it is truly an event (yes).
Specificity, or the true negative rate, is the probability that a binary response is predicted as a 0 (no), given that it is truly a non-event (no).
# Test-set ROC curves for the stepwise ("both") and full models, side by side.
par(mfrow = c(1,2))
roc.both <- roc(insurance.data.test$insuranceclaim ~ pred.both.test, plot = TRUE,
legacy.axes = TRUE, print.auc = TRUE, main = "Both Model ROC Curve")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Fix: store the full-model ROC in its own object instead of silently
# overwriting roc.both with the full model's curve.
roc.full <- roc(insurance.data.test$insuranceclaim ~ pred.full.test, plot = TRUE,
legacy.axes = TRUE, print.auc = TRUE, main = "Full Model ROC Curve")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
Following are the predictions, accuracy and roc curves on the train data.
# Predicted probabilities on the training data for both fitted models.
pred.both <- predict(both.logit, newdata = insurance.data.train, type="response")
pred.full <- predict(full.logit, newdata = insurance.data.train, type="response")
# Confusion tables at the 0.5 threshold (rows: predicted, cols: actual).
(table.both <- table(pred.both > 0.5, insurance.data.train$insuranceclaim))
##
## 0 1
## FALSE 371 62
## TRUE 73 563
(table.full <- table(pred.full > 0.5, insurance.data.train$insuranceclaim))
##
## 0 1
## FALSE 384 63
## TRUE 60 562
# Training accuracy (%).
(accuracy.both <- round((sum(diag(table.both))/sum(table.both))*100,2))
## [1] 87.37
(accuracy.full <- round((sum(diag(table.full))/sum(table.full))*100,2))
## [1] 88.49
# Training-set ROC curves for the stepwise ("both") and full models.
par(mfrow = c(1,2))
roc.both <- roc(insurance.data.train$insuranceclaim ~ pred.both, plot = TRUE,
legacy.axes = TRUE, print.auc = TRUE, main = "Both Model ROC Curve")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Fix: keep the full-model ROC in a separate object rather than clobbering
# roc.both, so both curve objects remain available afterwards.
roc.full <- roc(insurance.data.train$insuranceclaim ~ pred.full, plot = TRUE,
legacy.axes = TRUE, print.auc = TRUE, main = "Full Model ROC Curve")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
We can observe that the train and test data accuracy were similar for
the full model where as there is a slight difference between them in the
both model. ROC curve looks similar for both as well as the full
model.
##backward
Model was fitted based on the backward direction and the aic value is 628 which is less when compared to the both model.
# Backward elimination starting from the full model.
# NOTE(review): full.logit at this point is the refit on data.train.2 (outliers
# removed), which is why the summary below reports data = data.train.2. Also,
# step() does not use a `data` argument to refit — it reuses the data stored in
# the fitted model, so `data = insurance.data.train` here has no effect.
both.logit.backward <- step(full.logit,
direction="backward",trace=0, data = insurance.data.train)
formula(both.logit.backward)
## insuranceclaim ~ age + bmi + children + smoker
summary(both.logit.backward)
##
## Call:
## glm(formula = insuranceclaim ~ age + bmi + children + smoker,
## family = binomial(link = "logit"), data = data.train.2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.666757 0.852947 -12.506 < 2e-16 ***
## age 0.038961 0.007922 4.918 8.74e-07 ***
## bmi 0.371495 0.027749 13.388 < 2e-16 ***
## children1 -2.696091 0.278386 -9.685 < 2e-16 ***
## children2 -4.272616 0.353996 -12.070 < 2e-16 ***
## children3 -7.124781 0.575888 -12.372 < 2e-16 ***
## children4 -7.670886 1.080693 -7.098 1.26e-12 ***
## children5 -4.539262 1.005026 -4.517 6.29e-06 ***
## smoker1 5.928896 0.513490 11.546 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1434.90 on 1053 degrees of freedom
## Residual deviance: 610.43 on 1045 degrees of freedom
## AIC: 628.43
##
## Number of Fisher Scoring iterations: 7
Features such as age, bmi, children and smoker are the ones which were identified by the backward model as more relevant in predicting the response variable.
However, the residual plot does not look good, as it indicates the residuals are not normal.
car::qqPlot(residuals(both.logit.backward), main = NA, pch = 19, col = 2, cex = 0.7)
## 660 232
## 528 182
shapiro.test(residuals(both.logit.backward))
##
## Shapiro-Wilk normality test
##
## data: residuals(both.logit.backward)
## W = 0.96129, p-value = 4.199e-16
##forward
Following is the code for the forward selection method; its AIC value is 634, which is slightly greater than that of the backward model. (Note that because the search starts from the full model, forward selection cannot drop or add any terms.)
# NOTE(review): forward selection starting FROM the full model cannot add any
# terms, so this step() call is a no-op that simply keeps every predictor (as
# the retained formula below confirms). True forward selection should start
# from null.logit with scope = list(lower = formula(null.logit),
# upper = formula(full.logit)). Also, step()'s `data` argument does not refit.
both.logit.forward <- step(full.logit,
direction="forward",trace=0, data = insurance.data.train)
formula(both.logit.forward)
## insuranceclaim ~ age + sex + bmi + children + smoker + region +
## charges
summary(both.logit.forward)
##
## Call:
## glm(formula = insuranceclaim ~ age + sex + bmi + children + smoker +
## region + charges, family = binomial(link = "logit"), data = data.train.2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.052e+01 8.758e-01 -12.009 < 2e-16 ***
## age 3.617e-02 9.719e-03 3.722 0.000198 ***
## sex1 7.141e-02 2.071e-01 0.345 0.730266
## bmi 3.777e-01 2.861e-02 13.199 < 2e-16 ***
## children1 -2.702e+00 2.799e-01 -9.653 < 2e-16 ***
## children2 -4.288e+00 3.559e-01 -12.049 < 2e-16 ***
## children3 -7.162e+00 5.819e-01 -12.308 < 2e-16 ***
## children4 -7.697e+00 1.087e+00 -7.078 1.46e-12 ***
## children5 -4.529e+00 1.005e+00 -4.508 6.55e-06 ***
## smoker1 5.805e+00 6.032e-01 9.624 < 2e-16 ***
## region1 -4.717e-01 2.935e-01 -1.607 0.108100
## region2 -4.886e-01 2.982e-01 -1.639 0.101253
## region3 -3.348e-01 2.931e-01 -1.142 0.253393
## charges 8.773e-06 1.982e-05 0.443 0.658053
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1434.90 on 1053 degrees of freedom
## Residual deviance: 606.57 on 1040 degrees of freedom
## AIC: 634.57
##
## Number of Fisher Scoring iterations: 7
Features such as age, sex, bmi, children, region, charges and smoker are the ones retained by the forward model, since forward selection starting from the full model keeps all predictors.
car::qqPlot(residuals(both.logit.forward), main = NA, pch = 19, col = 2, cex = 0.7)
## 232 660
## 182 528
shapiro.test(residuals(both.logit.forward))
##
## Shapiro-Wilk normality test
##
## data: residuals(both.logit.forward)
## W = 0.96088, p-value = 3.394e-16
However, the residual plot does not look good, as it indicates the residuals are not normal.
Following are the predictions, accuracy of the test data for both forward and backward elimination models.
pred.both.forward <- predict(both.logit.forward, newdata = insurance.data.test, type="response")
pred.both.backward <- predict(both.logit.backward, newdata = insurance.data.test, type="response")
(table.both.forward <- table(pred.both.forward > 0.5, insurance.data.test$insuranceclaim))
##
## 0 1
## FALSE 95 16
## TRUE 16 141
(table.full.backward <- table(pred.both.backward > 0.5, insurance.data.test$insuranceclaim))
##
## 0 1
## FALSE 92 18
## TRUE 19 139
(accuracy.both.forward <- round((sum(diag(table.both.forward))/sum(table.both.forward))*100,2))
## [1] 88.06
(accuracy.full.backward <- round((sum(diag(table.full.backward))/sum(table.full.backward))*100,2))
## [1] 86.19
We can observe the accuracies of both the forward and backward selection methods are similar to those of the both-direction stepwise method.
Testing Strategy 2 - K-Fold Validation
K-fold cross-validation is a resampling technique commonly used in machine learning to assess the performance and generalization ability of a predictive model. The basic idea is to partition the dataset into k subsets (folds), train the model on k-1 folds, and evaluate it on the remaining fold. This process is repeated k times, with each of the k folds used exactly once as the validation data.
Following code implements the k-fold validation with 10 folds on the full model and the average accuracy obtained is 87%(approx).
# Binomial Full Logit Model - K fold validation
# Set the number of folds (K)
num_folds <- 10
# Create an index vector for splitting
set.seed(123) # for reproducibility
indices <- createFolds(insurance.data$insuranceclaim, k = num_folds, list = TRUE)
# Initialize a variable to store cross-validation results
cv_results <- data.frame(Accuracies = double(num_folds))
# Perform K-Fold Cross-Validation
for (i in 1:num_folds) {
# Split the data into training and testing sets
train_data <- insurance.data[-indices[[i]], ]
test_data <- insurance.data[indices[[i]], ]
# Fit the full logit model on this fold's training data
model <- glm(insuranceclaim ~ . ,data = train_data,
family = binomial(link = "logit"))
# Make predictions on the test data
predictions <- predict(model, newdata = test_data, type="response")
(table.full <- table(predictions > 0.5, test_data$insuranceclaim))
acc <- round((sum(diag(table.full))/sum(table.full))*100,2)
# Store the fold accuracy (not RMSE) in the results data frame
cv_results$Accuracies[i] <- acc
}
# Display cross-validation results
print(cv_results)
## Accuracies
## 1 86.47
## 2 87.31
## 3 87.41
## 4 84.96
## 5 87.22
## 6 85.07
## 7 85.82
## 8 84.33
## 9 93.23
## 10 86.57
cv_results$Accuracies <- as.numeric(cv_results$Accuracies)
mean_accuracy <- mean(cv_results$Accuracies, na.rm = TRUE)
mean_accuracy
## [1] 86.839
# Binomial Logit Model - K fold validation
Following code implements the k-fold validation with 10 folds on the both model and the average accuracy obtained is 88%.
# Binomial Both Logit Model - K fold validation
# Set the number of folds (K)
num_folds <- 10
# Create an index vector for splitting
set.seed(123) # for reproducibility
indices <- createFolds(insurance.data$insuranceclaim, k = num_folds, list = TRUE)
# Initialize a variable to store cross-validation results
cv_results <- data.frame(Accuracies = double(num_folds))
# Perform K-Fold Cross-Validation
for (i in seq_len(num_folds)) {
# Split the data into training and testing sets
train_data <- insurance.data[-indices[[i]], ]
test_data <- insurance.data[indices[[i]], ]
# Fix: fit the full model on THIS fold's training data, then run stepwise
# selection on it. Previously step(full.logit, ...) reused a model fitted on
# data.train.2, so train_data was never used for fitting — step()'s `data`
# argument does not refit a model. (Re-run to regenerate the printed results.)
fold.full <- glm(insuranceclaim ~ ., data = train_data,
family = binomial(link = "logit"))
model <- step(fold.full, direction = "both", trace = 0)
# Make predictions on the held-out fold
predictions <- predict(model, newdata = test_data, type = "response")
table.fold <- table(predictions > 0.5, test_data$insuranceclaim)
acc <- round((sum(diag(table.fold)) / sum(table.fold)) * 100, 2)
# Store the fold accuracy in the results data frame
cv_results$Accuracies[i] <- acc
}
# Display cross-validation results
print(cv_results)
## Accuracies
## 1 86.47
## 2 88.06
## 3 88.15
## 4 85.71
## 5 88.72
## 6 86.57
## 7 88.81
## 8 85.82
## 9 93.23
## 10 88.81
cv_results$Accuracies <- as.numeric(cv_results$Accuracies)
mean_accuracy <- mean(cv_results$Accuracies, na.rm = TRUE)
mean_accuracy
## [1] 88.035
# Binomial Logit Model - K fold validation
The probit link function is an alternative link function.
Starting with the standard normal c.d.f \(\phi(z)\) which lies in the interval \([0, 1]\), the probit (or inverse normal c.d.f.) link assumes that
\[ \phi^{-1}(\pi_i) = \eta_i \] {#eq-probit1}
so that
\[ \pi_i = \Phi(\eta_i) \] {#eq-probit2}
where \(\eta_i\) is given by @eq-binarysys as
\[ \eta_i = \beta_0 + \sum_{j=1}^p \beta_j X_{i,j} = \mathbf{x}_i' \boldsymbol{\beta}. \] Null Hypothesis (\(H_0\)): \[ H_0: \beta_j = 0 \]
The null hypothesis asserts that there is no association between the independent variable \(X_j\) and the probability of the dependent variable being in the “success” category.
Alternative Hypothesis (\(H_1\)): \[ H_1: \beta_j \neq 0 \]
The alternative hypothesis suggests that the independent variable \(X_j\) does have a significant association with the probability of the event.
# Probit Full Model
# Same predictors as the full logit model but with the probit link, so the
# coefficients are on the standard-normal (z-score) scale rather than log-odds.
full.probit <- glm(insuranceclaim ~ . ,data = insurance.data.train ,
family = binomial(link = "probit"))
summary(full.probit)
##
## Call:
## glm(formula = insuranceclaim ~ ., family = binomial(link = "probit"),
## data = insurance.data.train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.012e+00 3.591e-01 -11.173 < 2e-16 ***
## age 1.821e-02 4.694e-03 3.879 0.000105 ***
## sex1 3.719e-02 1.028e-01 0.362 0.717537
## bmi 1.458e-01 1.108e-02 13.158 < 2e-16 ***
## children1 -1.232e+00 1.325e-01 -9.295 < 2e-16 ***
## children2 -1.923e+00 1.561e-01 -12.317 < 2e-16 ***
## children3 -2.571e+00 2.026e-01 -12.692 < 2e-16 ***
## children4 -2.623e+00 3.826e-01 -6.857 7.02e-12 ***
## children5 -2.107e+00 4.526e-01 -4.654 3.25e-06 ***
## smoker1 2.162e+00 2.518e-01 8.585 < 2e-16 ***
## region1 -2.164e-01 1.460e-01 -1.482 0.138392
## region2 -1.982e-01 1.496e-01 -1.325 0.185238
## region3 -1.820e-01 1.475e-01 -1.234 0.217249
## charges 6.872e-06 9.576e-06 0.718 0.472970
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1451.15 on 1068 degrees of freedom
## Residual deviance: 765.01 on 1055 degrees of freedom
## AIC: 793.01
##
## Number of Fisher Scoring iterations: 7
Intercept ((Intercept)): The coefficient is -4.012, the value of the probit index (z-score scale) when all predictors are zero. It is significantly negative, implying a low baseline probability of making a claim.
Age (age): A one-unit increase in age raises the probit index by 0.01821. This effect is statistically significant.
Sex (sex1): The coefficient is not statistically significant at the 0.05 significance level, suggesting that gender may not be a significant predictor of insurance claims.
BMI (bmi): A one-unit increase in BMI raises the probit index by 0.1458. This effect is statistically significant.
Children (children1, children2, children3, children4, children5): The number of children has a significant negative impact on the probit index, and hence on the probability of making a claim.
Smoker (smoker1): Being a smoker raises the probit index by 2.162, a highly significant effect.
Region (region1, region2, region3): The region coefficients are not statistically significant, suggesting that region may not be a significant predictor of insurance claims.
Charges (charges): The coefficient is not statistically significant at the 0.05 significance level, indicating that charges may not be a significant predictor. Note that probit coefficients are on the standard-normal scale, not the log-odds scale.
car::qqPlot(residuals(full.probit), main = NA, pch = 19, col = 2, cex = 0.7)
## 1227 681
## 982 554
shapiro.test(residuals(full.probit))
##
## Shapiro-Wilk normality test
##
## data: residuals(full.probit)
## W = 0.97872, p-value = 2.062e-11
From the residual plot we can observe that the residuals are not normal, which is also confirmed statistically by the Shapiro-Wilk test.
Again, a model with smaller BIC is better which is full.logit model in comparison to the both logit model.
Following are the train and test accuracies
#train data accuracy
pred.both <- predict(full.probit, newdata = insurance.data.train, type="response")
(table.both <- table(pred.both > 0.5, insurance.data.train$insuranceclaim))
##
## 0 1
## FALSE 364 56
## TRUE 80 569
# Fix: compute accuracy from the probit confusion table just built (table.both).
# The original divided the stale logit object table.both.forward left over from
# an earlier chunk, so the printed 88.06 did not correspond to this table.
(accuracy.probit.train <- round((sum(diag(table.both))/sum(table.both))*100,2))
## [1] 87.28
#test data accuracy
pred.both <- predict(full.probit, newdata = insurance.data.test, type="response")
(table.both <- table(pred.both > 0.5, insurance.data.test$insuranceclaim))
##
## 0 1
## FALSE 87 17
## TRUE 24 140
(accuracy.probit.test <- round((sum(diag(table.both))/sum(table.both))*100,2))
## [1] 84.7
From the tables above, the probit model's training accuracy is about 87.3% ((364 + 569) / 1069) and its test accuracy about 84.7% ((87 + 140) / 268) — similar to the logit models, with the training accuracy somewhat higher than the test accuracy.
Following is the code to fit the probit model on the reduced predictors such as age, bmi, children, smoker
# Probit Model
# Reduced probit model using only the predictors retained by the earlier
# stepwise selection: age, bmi, children and smoker.
full.predictors.probit <- glm(insuranceclaim ~ age + bmi + children + smoker ,data = insurance.data.train ,
family = binomial(link = "probit"))
summary(full.predictors.probit)
##
## Call:
## glm(formula = insuranceclaim ~ age + bmi + children + smoker,
## family = binomial(link = "probit"), data = insurance.data.train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.124238 0.344506 -11.971 < 2e-16 ***
## age 0.020223 0.003899 5.187 2.14e-07 ***
## bmi 0.144275 0.010687 13.500 < 2e-16 ***
## children1 -1.232168 0.132227 -9.319 < 2e-16 ***
## children2 -1.912556 0.155338 -12.312 < 2e-16 ***
## children3 -2.568484 0.201282 -12.761 < 2e-16 ***
## children4 -2.583550 0.380364 -6.792 1.10e-11 ***
## children5 -2.092116 0.452774 -4.621 3.83e-06 ***
## smoker1 2.279723 0.191072 11.931 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1451.15 on 1068 degrees of freedom
## Residual deviance: 768.43 on 1060 degrees of freedom
## AIC: 786.43
##
## Number of Fisher Scoring iterations: 7
The probit regression model provides insight into the factors influencing the likelihood of making an insurance claim. The intercept, -4.124238, is the value of the probit index (z-score scale) when all other predictors are zero. Each additional year of age increases the probit index by 0.020223, and each unit increase in BMI raises it by 0.144275.
The presence of children plays a notable role. Having one, two, three, four, or five children shifts the probit index by -1.232168, -1.912556, -2.568484, -2.583550, and -2.092116, respectively, so the number of children inversely affects the probability of an insurance claim.
On the other hand, being a smoker raises the probit index by 2.279723, indicating a substantial increase in the likelihood of making a claim. Note these coefficients are on the standard-normal scale, not log-odds. These insights provide a nuanced understanding of how different factors contribute to insurance claim predictions.
car::qqPlot(residuals(full.predictors.probit), main = NA, pch = 19, col = 2, cex = 0.7)
## 1227 429
## 982 347
shapiro.test(residuals(full.predictors.probit))
##
## Shapiro-Wilk normality test
##
## data: residuals(full.predictors.probit)
## W = 0.97837, p-value = 1.574e-11
Above plot illustrates that the data points are not normal as there are deviations from the normal line.
# Train/test accuracy of the reduced probit model.
pred.both.train.probit <- predict(full.predictors.probit, newdata = insurance.data.train, type="response")
(table.both <- table(pred.both.train.probit > 0.5, insurance.data.train$insuranceclaim))
##
## 0 1
## FALSE 366 57
## TRUE 78 568
# Fix: use the freshly computed table.both. The original divided the stale
# table.both.forward, so the printed 88.06 did not match the table above.
(accuracy.reduced.probit.train <- round((sum(diag(table.both))/sum(table.both))*100,2))
## [1] 87.37
pred.both.test.probit <- predict(full.predictors.probit, newdata = insurance.data.test, type="response")
(table.both <- table(pred.both.test.probit > 0.5, insurance.data.test$insuranceclaim))
##
## 0 1
## FALSE 88 15
## TRUE 23 142
(accuracy.reduced.probit.test <- round((sum(diag(table.both))/sum(table.both))*100,2))
## [1] 85.82
The classification model demonstrates a test accuracy of 85.82% ((88 + 142) / 268), the proportion of accurate predictions relative to the total. There are 142 true positives (class 1 predicted correctly) and 88 true negatives (class 0 predicted correctly). The model incurred 23 false positives, where it predicted class 1 but the actual class was 0, and 15 false negatives, where it predicted class 0 but the true class was 1.
# ROC curves for the reduced probit model on training and test data.
par(mfrow = c(1,2))
# Training ROC curve
roc.both.train <- roc(insurance.data.train$insuranceclaim ~ pred.both.train.probit, plot = TRUE,
legacy.axes = TRUE, print.auc = TRUE, main = "Training ROC Curve")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Testing ROC curve
roc.both.test <- roc(insurance.data.test$insuranceclaim ~ pred.both.test.probit, plot = TRUE,
legacy.axes = TRUE, print.auc = TRUE, main = "Testing ROC Curve")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Binomial Full Probit Model - K fold validation
# Set the number of folds (K)
num_folds <- 10
# Create an index vector for splitting
set.seed(123) # for reproducibility
indices <- createFolds(insurance.data$insuranceclaim, k = num_folds, list = TRUE)
# Initialize a variable to store cross-validation results
cv_results <- data.frame(Accuracies = double(num_folds))
# Perform K-Fold Cross-Validation
for (i in 1:num_folds) {
# Split the data into training and testing sets
train_data <- insurance.data[-indices[[i]], ]
test_data <- insurance.data[indices[[i]], ]
# Fit our model on the training data
model <- glm(insuranceclaim ~ . ,data = train_data,
family = binomial(link = "probit"))
# Make predictions on the test data
predictions <- predict(model, newdata = test_data, type="response")
(table.full <- table(predictions > 0.5, test_data$insuranceclaim))
acc <- round((sum(diag(table.full))/sum(table.full))*100,2)
# Store the fold accuracy (not RMSE) in the results data frame
cv_results$Accuracies[i] <- acc
}
# Display cross-validation results
print(cv_results)
## Accuracies
## 1 86.47
## 2 86.57
## 3 86.67
## 4 84.21
## 5 87.22
## 6 84.33
## 7 85.07
## 8 84.33
## 9 92.48
## 10 85.82
cv_results$Accuracies <- as.numeric(cv_results$Accuracies)
mean_accuracy <- mean(cv_results$Accuracies, na.rm = TRUE)
mean_accuracy
## [1] 86.317
# Binomial Logit Model - K fold validation
The model demonstrates relatively consistent performance across the folds, with accuracy ranging from 84.21% to 92.48%. This suggests that the model is not heavily dependent on a specific subset of the data and maintains its predictive capability across various scenarios. The mean accuracy of approximately 86.32% indicates that, on average, the model correctly predicts whether an individual will make an insurance claim in about 86% of cases — a reasonably effective predictive performance.
We define two impurity measures, Gini index and entropy, for classifying a response with \(J\) categories.
The Gini index is defined by
\[ \mbox{Gini index} = 1 - \sum_{j=1}^J p^2_j, \] {#eq-gini-index}
where \(p_j = P(Y \in \mbox{class } j),~j=1,\ldots,J\). The Gini index lies in \([0, 1 - 1/J]\). The value \(0\) denotes a pure node where all the cases belong to a single class, while the maximum \(1 - 1/J\) is attained when cases are spread uniformly across the \(J\) classes. For a binary response (\(J = 2\)) the maximum is \(0.5\), reached when the two classes are equally represented.
Entropy is an alternate impurity measure, which lies in \([0, \log_2 J]\) (i.e. in \([0, 1]\) for a binary response):
\[ \mbox{Entropy} = \sum_{j=1}^J-p_j \log_2 (p_j). \] {#eq-entropy}
The rpart package uses the Gini index as the impurity index and minimizes a cost
\[ \text{Cost}_{CP}(\mbox{Tree}) = \mbox{Error(Tree)} + Cp \ \mathcal{N}(\mbox{Tree}), \]
where, Error(Tree) is the fraction of misclassified cases and \(\mathcal{N}(\mbox{Tree})\) is the number of leaf nodes in the tree.
# Classification and Regression Trees
# Re-read the raw data and create a fresh stratified 80/20 train/test split
# for the tree-based models.
insurance.data.dup <- read.csv("~/Documents/GitHub/GitHub/Insurance-Claim-Prediction/data/insurance.csv")
insurance.data <- insurance.data.dup
set.seed(12345)
train.prop <- 0.80
# Stratify on the response so both sets keep the same class balance.
strats <- insurance.data$insuranceclaim
rr <- split(1:length(strats), strats)
idx <- sort(as.numeric(unlist(sapply(rr,
function(x) sample(x, length(x)*train.prop)))))
insurance.data.train <- insurance.data[idx, ]
insurance.data.test <- insurance.data[-idx, ]
# Class proportions in the training set (should mirror the full data).
table(insurance.data.train$insuranceclaim)/nrow(insurance.data.train)
##
## 0 1
## 0.4149533 0.5850467
# Classification tree on all predictors; minsplit = 1 and a small cp grow a
# deliberately deep tree (to be pruned later via the CP table).
fit.allp <- rpart(insuranceclaim ~., method = "class", data = insurance.data.train,
control = rpart.control(minsplit = 1, cp = 0.001))
summary(fit.allp)
## Call:
## rpart(formula = insuranceclaim ~ ., data = insurance.data.train,
## method = "class", control = rpart.control(minsplit = 1, cp = 0.001))
## n= 1070
##
## CP nsplit rel error xerror xstd
## 1 0.315315315 0 1.000000000 1.00000000 0.03629976
## 2 0.128378378 1 0.684684685 0.71846847 0.03370082
## 3 0.051801802 3 0.427927928 0.43693694 0.02838429
## 4 0.043918919 4 0.376126126 0.42792793 0.02815421
## 5 0.033783784 6 0.288288288 0.32432432 0.02514270
## 6 0.027027027 7 0.254504505 0.29954955 0.02430641
## 7 0.020270270 8 0.227477477 0.25675676 0.02273037
## 8 0.018018018 10 0.186936937 0.23648649 0.02191712
## 9 0.016891892 11 0.168918919 0.19594595 0.02013546
## 10 0.015765766 13 0.135135135 0.19594595 0.02013546
## 11 0.013513514 14 0.119369369 0.17117117 0.01892453
## 12 0.012762763 15 0.105855856 0.16891892 0.01880907
## 13 0.009009009 18 0.067567568 0.10585586 0.01509774
## 14 0.006756757 19 0.058558559 0.10360360 0.01494356
## 15 0.005630631 22 0.038288288 0.08333333 0.01346096
## 16 0.004504505 24 0.027027027 0.08333333 0.01346096
## 17 0.002252252 27 0.013513514 0.05630631 0.01112893
## 18 0.001126126 30 0.006756757 0.05405405 0.01090929
## 19 0.001000000 36 0.000000000 0.05180180 0.01068470
##
## Variable importance
## bmi children charges smoker age region
## 31 26 20 12 10 1
##
## Node number 1: 1070 observations, complexity param=0.3153153
## predicted class=1 expected loss=0.4149533 P(node) =1
## class counts: 444 626
## probabilities: 0.415 0.585
## left son=2 (244 obs) right son=3 (826 obs)
## Primary splits:
## bmi < 25.9825 to the left, improve=87.44814, (0 missing)
## children < 0.5 to the right, improve=82.55445, (0 missing)
## smoker < 0.5 to the left, improve=57.01775, (0 missing)
## charges < 33047.5 to the left, improve=45.12917, (0 missing)
## age < 41.5 to the left, improve=16.60421, (0 missing)
##
## Node number 2: 244 observations, complexity param=0.0518018
## predicted class=0 expected loss=0.2131148 P(node) =0.2280374
## class counts: 192 52
## probabilities: 0.787 0.213
## left son=4 (191 obs) right son=5 (53 obs)
## Primary splits:
## smoker < 0.5 to the left, improve=34.378990, (0 missing)
## charges < 14511.86 to the left, improve=26.570560, (0 missing)
## bmi < 17.575 to the right, improve= 7.247083, (0 missing)
## age < 63.5 to the left, improve= 2.344399, (0 missing)
## region < 2.5 to the right, improve= 0.787562, (0 missing)
## Surrogate splits:
## charges < 14511.86 to the left, agree=0.939, adj=0.717, (0 split)
## bmi < 25.845 to the left, agree=0.787, adj=0.019, (0 split)
##
## Node number 3: 826 observations, complexity param=0.1283784
## predicted class=1 expected loss=0.3050847 P(node) =0.7719626
## class counts: 252 574
## probabilities: 0.305 0.695
## left son=6 (473 obs) right son=7 (353 obs)
## Primary splits:
## children < 0.5 to the right, improve=114.753100, (0 missing)
## smoker < 0.5 to the left, improve= 30.605070, (0 missing)
## charges < 33047.5 to the left, improve= 24.727040, (0 missing)
## age < 41.5 to the left, improve= 13.870640, (0 missing)
## bmi < 31.01 to the left, improve= 9.525539, (0 missing)
## Surrogate splits:
## charges < 3220.372 to the right, agree=0.662, adj=0.210, (0 split)
## age < 25.5 to the right, agree=0.646, adj=0.173, (0 split)
## bmi < 26.6475 to the right, agree=0.579, adj=0.014, (0 split)
##
## Node number 4: 191 observations, complexity param=0.01576577
## predicted class=0 expected loss=0.07329843 P(node) =0.1785047
## class counts: 177 14
## probabilities: 0.927 0.073
## left son=8 (184 obs) right son=9 (7 obs)
## Primary splits:
## bmi < 17.575 to the right, improve=12.4802500, (0 missing)
## charges < 30225.63 to the left, improve= 1.7265910, (0 missing)
## children < 1.5 to the left, improve= 0.8043285, (0 missing)
## age < 63.5 to the left, improve= 0.7360038, (0 missing)
## region < 0.5 to the right, improve= 0.1247432, (0 missing)
##
## Node number 5: 53 observations, complexity param=0.01689189
## predicted class=1 expected loss=0.2830189 P(node) =0.04953271
## class counts: 15 38
## probabilities: 0.283 0.717
## left son=10 (23 obs) right son=11 (30 obs)
## Primary splits:
## children < 1.5 to the right, improve=11.074650, (0 missing)
## age < 41 to the left, improve= 6.509434, (0 missing)
## charges < 19479.9 to the left, improve= 4.805730, (0 missing)
## bmi < 19.1975 to the left, improve= 4.448209, (0 missing)
## region < 2.5 to the right, improve= 1.610444, (0 missing)
## Surrogate splits:
## bmi < 24.265 to the right, agree=0.660, adj=0.217, (0 split)
## region < 2.5 to the right, agree=0.623, adj=0.130, (0 split)
## charges < 16717.01 to the right, agree=0.604, adj=0.087, (0 split)
## age < 29.5 to the right, agree=0.585, adj=0.043, (0 split)
##
## Node number 6: 473 observations, complexity param=0.1283784
## predicted class=0 expected loss=0.4672304 P(node) =0.4420561
## class counts: 252 221
## probabilities: 0.533 0.467
## left son=12 (378 obs) right son=13 (95 obs)
## Primary splits:
## smoker < 0.5 to the left, improve=52.43251, (0 missing)
## charges < 30124.26 to the left, improve=45.66116, (0 missing)
## age < 40.5 to the left, improve=26.26169, (0 missing)
## bmi < 31.1925 to the left, improve=15.70518, (0 missing)
## children < 2.5 to the right, improve=14.09699, (0 missing)
## Surrogate splits:
## charges < 30124.26 to the left, agree=0.941, adj=0.705, (0 split)
## bmi < 26.1525 to the right, agree=0.801, adj=0.011, (0 split)
##
## Node number 7: 353 observations
## predicted class=1 expected loss=0 P(node) =0.3299065
## class counts: 0 353
## probabilities: 0.000 1.000
##
## Node number 8: 184 observations, complexity param=0.004504505
## predicted class=0 expected loss=0.03804348 P(node) =0.1719626
## class counts: 177 7
## probabilities: 0.962 0.038
## left son=16 (157 obs) right son=17 (27 obs)
## Primary splits:
## bmi < 25.3325 to the left, improve=2.14679700, (0 missing)
## charges < 30225.63 to the left, improve=1.86083400, (0 missing)
## children < 1.5 to the left, improve=1.28220600, (0 missing)
## age < 63.5 to the left, improve=0.86299570, (0 missing)
## region < 2.5 to the left, improve=0.08394667, (0 missing)
## Surrogate splits:
## charges < 28149.52 to the left, agree=0.859, adj=0.037, (0 split)
##
## Node number 9: 7 observations
## predicted class=1 expected loss=0 P(node) =0.006542056
## class counts: 0 7
## probabilities: 0.000 1.000
##
## Node number 10: 23 observations, complexity param=0.01689189
## predicted class=0 expected loss=0.3478261 P(node) =0.02149533
## class counts: 15 8
## probabilities: 0.652 0.348
## left son=20 (15 obs) right son=21 (8 obs)
## Primary splits:
## age < 41.5 to the left, improve=10.4347800, (0 missing)
## charges < 19621.16 to the left, improve= 7.2347830, (0 missing)
## bmi < 24.56 to the left, improve= 2.2501670, (0 missing)
## children < 2.5 to the right, improve= 0.7732441, (0 missing)
## region < 2.5 to the right, improve= 0.5328218, (0 missing)
## Surrogate splits:
## charges < 19621.16 to the left, agree=0.913, adj=0.75, (0 split)
## bmi < 24.56 to the left, agree=0.739, adj=0.25, (0 split)
##
## Node number 11: 30 observations
## predicted class=1 expected loss=0 P(node) =0.02803738
## class counts: 0 30
## probabilities: 0.000 1.000
##
## Node number 12: 378 observations, complexity param=0.04391892
## predicted class=0 expected loss=0.3492063 P(node) =0.353271
## class counts: 246 132
## probabilities: 0.651 0.349
## left son=24 (187 obs) right son=25 (191 obs)
## Primary splits:
## age < 40.5 to the left, improve=27.8931800, (0 missing)
## charges < 7146.168 to the left, improve=17.2151500, (0 missing)
## children < 1.5 to the right, improve=13.6844100, (0 missing)
## bmi < 31.1925 to the left, improve=12.0039700, (0 missing)
## region < 0.5 to the right, improve= 0.5812166, (0 missing)
## Surrogate splits:
## charges < 6659.237 to the left, agree=0.897, adj=0.791, (0 split)
## bmi < 31.3025 to the left, agree=0.566, adj=0.123, (0 split)
## sex < 0.5 to the right, agree=0.534, adj=0.059, (0 split)
## region < 0.5 to the right, agree=0.526, adj=0.043, (0 split)
## children < 3.5 to the right, agree=0.516, adj=0.021, (0 split)
##
## Node number 13: 95 observations, complexity param=0.005630631
## predicted class=1 expected loss=0.06315789 P(node) =0.08878505
## class counts: 6 89
## probabilities: 0.063 0.937
## left son=26 (15 obs) right son=27 (80 obs)
## Primary splits:
## charges < 21902.02 to the left, improve=2.6004390, (0 missing)
## bmi < 29.765 to the left, improve=2.2421050, (0 missing)
## children < 3.5 to the right, improve=1.7740200, (0 missing)
## age < 35.5 to the left, improve=0.8363513, (0 missing)
## region < 1.5 to the left, improve=0.1132164, (0 missing)
## Surrogate splits:
## bmi < 29.765 to the left, agree=0.905, adj=0.400, (0 split)
## age < 21 to the left, agree=0.863, adj=0.133, (0 split)
##
## Node number 16: 157 observations, complexity param=0.001126126
## predicted class=0 expected loss=0.006369427 P(node) =0.146729
## class counts: 156 1
## probabilities: 0.994 0.006
## left son=32 (147 obs) right son=33 (10 obs)
## Primary splits:
## bmi < 25.1275 to the left, improve=0.18726110, (0 missing)
## children < 1.5 to the left, improve=0.03073941, (0 missing)
## region < 0.5 to the right, improve=0.02981434, (0 missing)
## charges < 9030.432 to the left, improve=0.02362478, (0 missing)
## age < 44.5 to the left, improve=0.02004803, (0 missing)
##
## Node number 17: 27 observations, complexity param=0.004504505
## predicted class=0 expected loss=0.2222222 P(node) =0.02523364
## class counts: 21 6
## probabilities: 0.778 0.222
## left son=34 (19 obs) right son=35 (8 obs)
## Primary splits:
## children < 1.5 to the left, improve=6.3333330, (0 missing)
## charges < 9016.941 to the left, improve=2.4761900, (0 missing)
## age < 44.5 to the left, improve=1.8333330, (0 missing)
## bmi < 25.3825 to the right, improve=1.3333330, (0 missing)
## region < 1.5 to the left, improve=0.5333333, (0 missing)
## Surrogate splits:
## bmi < 25.3825 to the right, agree=0.741, adj=0.125, (0 split)
## charges < 12385.87 to the left, agree=0.741, adj=0.125, (0 split)
##
## Node number 20: 15 observations
## predicted class=0 expected loss=0 P(node) =0.01401869
## class counts: 15 0
## probabilities: 1.000 0.000
##
## Node number 21: 8 observations
## predicted class=1 expected loss=0 P(node) =0.007476636
## class counts: 0 8
## probabilities: 0.000 1.000
##
## Node number 24: 187 observations, complexity param=0.01351351
## predicted class=0 expected loss=0.1550802 P(node) =0.1747664
## class counts: 158 29
## probabilities: 0.845 0.155
## left son=48 (169 obs) right son=49 (18 obs)
## Primary splits:
## bmi < 40.1375 to the left, improve=10.42547000, (0 missing)
## children < 1.5 to the right, improve= 8.28593800, (0 missing)
## charges < 4170.071 to the right, improve= 3.18811300, (0 missing)
## age < 27.5 to the right, improve= 0.66374980, (0 missing)
## region < 2.5 to the left, improve= 0.04327544, (0 missing)
##
## Node number 25: 191 observations, complexity param=0.04391892
## predicted class=1 expected loss=0.460733 P(node) =0.1785047
## class counts: 88 103
## probabilities: 0.461 0.539
## left son=50 (56 obs) right son=51 (135 obs)
## Primary splits:
## children < 2.5 to the right, improve=10.187190, (0 missing)
## bmi < 34.49 to the left, improve= 7.423943, (0 missing)
## charges < 11881.66 to the right, improve= 2.994786, (0 missing)
## age < 53.5 to the right, improve= 1.934244, (0 missing)
## region < 1.5 to the left, improve= 1.254331, (0 missing)
## Surrogate splits:
## age < 62.5 to the right, agree=0.723, adj=0.054, (0 split)
##
## Node number 26: 15 observations, complexity param=0.005630631
## predicted class=1 expected loss=0.3333333 P(node) =0.01401869
## class counts: 5 10
## probabilities: 0.333 0.667
## left son=52 (5 obs) right son=53 (10 obs)
## Primary splits:
## children < 1.5 to the right, improve=6.6666670, (0 missing)
## charges < 19826.58 to the right, improve=1.4880950, (0 missing)
## bmi < 29.315 to the right, improve=0.9523810, (0 missing)
## age < 35.5 to the left, improve=0.5128205, (0 missing)
## region < 2.5 to the right, improve=0.1282051, (0 missing)
## Surrogate splits:
## bmi < 28.535 to the right, agree=0.733, adj=0.2, (0 split)
## charges < 19826.58 to the right, agree=0.733, adj=0.2, (0 split)
##
## Node number 27: 80 observations, complexity param=0.001126126
## predicted class=1 expected loss=0.0125 P(node) =0.07476636
## class counts: 1 79
## probabilities: 0.013 0.988
## left son=54 (9 obs) right son=55 (71 obs)
## Primary splits:
## bmi < 29.07 to the left, improve=0.19722220, (0 missing)
## charges < 33545.31 to the left, improve=0.14166670, (0 missing)
## children < 2.5 to the right, improve=0.07023810, (0 missing)
## age < 42.5 to the left, improve=0.03382353, (0 missing)
## region < 1.5 to the left, improve=0.02905405, (0 missing)
## Surrogate splits:
## charges < 29526.8 to the left, agree=0.975, adj=0.778, (0 split)
##
## Node number 32: 147 observations
## predicted class=0 expected loss=0 P(node) =0.1373832
## class counts: 147 0
## probabilities: 1.000 0.000
##
## Node number 33: 10 observations, complexity param=0.001126126
## predicted class=0 expected loss=0.1 P(node) =0.009345794
## class counts: 9 1
## probabilities: 0.900 0.100
## left son=66 (9 obs) right son=67 (1 obs)
## Primary splits:
## children < 1.5 to the left, improve=1.8000000, (0 missing)
## region < 0.5 to the right, improve=0.4666667, (0 missing)
## charges < 8768.868 to the left, improve=0.3000000, (0 missing)
## age < 43.5 to the left, improve=0.2000000, (0 missing)
## sex < 0.5 to the right, improve=0.2000000, (0 missing)
##
## Node number 34: 19 observations
## predicted class=0 expected loss=0 P(node) =0.01775701
## class counts: 19 0
## probabilities: 1.000 0.000
##
## Node number 35: 8 observations, complexity param=0.004504505
## predicted class=1 expected loss=0.25 P(node) =0.007476636
## class counts: 2 6
## probabilities: 0.250 0.750
## left son=70 (2 obs) right son=71 (6 obs)
## Primary splits:
## age < 38 to the left, improve=3.0000000, (0 missing)
## charges < 7018.252 to the left, improve=3.0000000, (0 missing)
## bmi < 25.7425 to the right, improve=1.6666670, (0 missing)
## children < 2.5 to the left, improve=0.6000000, (0 missing)
## region < 0.5 to the left, improve=0.3333333, (0 missing)
## Surrogate splits:
## charges < 7018.252 to the left, agree=1.000, adj=1.0, (0 split)
## bmi < 25.7425 to the right, agree=0.875, adj=0.5, (0 split)
##
## Node number 48: 169 observations, complexity param=0.01276276
## predicted class=0 expected loss=0.1005917 P(node) =0.1579439
## class counts: 152 17
## probabilities: 0.899 0.101
## left son=96 (99 obs) right son=97 (70 obs)
## Primary splits:
## children < 1.5 to the right, improve=4.8370250, (0 missing)
## bmi < 31.1925 to the left, improve=4.0863750, (0 missing)
## charges < 4170.071 to the right, improve=2.2165900, (0 missing)
## age < 32.5 to the right, improve=0.8416554, (0 missing)
## region < 1.5 to the right, improve=0.2004161, (0 missing)
## Surrogate splits:
## charges < 3393.167 to the right, agree=0.692, adj=0.257, (0 split)
## bmi < 28.4525 to the right, agree=0.598, adj=0.029, (0 split)
## age < 18.5 to the right, agree=0.592, adj=0.014, (0 split)
##
## Node number 49: 18 observations, complexity param=0.006756757
## predicted class=1 expected loss=0.3333333 P(node) =0.01682243
## class counts: 6 12
## probabilities: 0.333 0.667
## left son=98 (9 obs) right son=99 (9 obs)
## Primary splits:
## children < 1.5 to the right, improve=4.0000000, (0 missing)
## age < 28.5 to the right, improve=1.5384620, (0 missing)
## bmi < 45.06 to the left, improve=1.5384620, (0 missing)
## charges < 3979.797 to the right, improve=1.5384620, (0 missing)
## region < 0.5 to the right, improve=0.2352941, (0 missing)
## Surrogate splits:
## bmi < 42.92 to the right, agree=0.778, adj=0.556, (0 split)
## age < 26.5 to the right, agree=0.722, adj=0.444, (0 split)
## charges < 3551.876 to the right, agree=0.722, adj=0.444, (0 split)
## region < 1.5 to the right, agree=0.611, adj=0.222, (0 split)
## sex < 0.5 to the right, agree=0.556, adj=0.111, (0 split)
##
## Node number 50: 56 observations, complexity param=0.03378378
## predicted class=0 expected loss=0.2857143 P(node) =0.05233645
## class counts: 40 16
## probabilities: 0.714 0.286
## left son=100 (41 obs) right son=101 (15 obs)
## Primary splits:
## bmi < 29.9725 to the right, improve=20.9059200, (0 missing)
## age < 53.5 to the right, improve= 2.7501910, (0 missing)
## children < 4.5 to the left, improve= 2.1164020, (0 missing)
## charges < 11258.98 to the right, improve= 1.4404760, (0 missing)
## region < 0.5 to the right, improve= 0.3180124, (0 missing)
##
## Node number 51: 135 observations, complexity param=0.02702703
## predicted class=1 expected loss=0.3555556 P(node) =0.1261682
## class counts: 48 87
## probabilities: 0.356 0.644
## left son=102 (78 obs) right son=103 (57 obs)
## Primary splits:
## bmi < 33.5825 to the left, improve=18.105530, (0 missing)
## region < 1.5 to the left, improve= 2.115893, (0 missing)
## charges < 7099.057 to the left, improve= 1.866667, (0 missing)
## age < 61.5 to the left, improve= 1.312821, (0 missing)
## sex < 0.5 to the left, improve= 0.533928, (0 missing)
## Surrogate splits:
## charges < 14177.35 to the left, agree=0.622, adj=0.105, (0 split)
## age < 60.5 to the left, agree=0.600, adj=0.053, (0 split)
##
## Node number 52: 5 observations
## predicted class=0 expected loss=0 P(node) =0.004672897
## class counts: 5 0
## probabilities: 1.000 0.000
##
## Node number 53: 10 observations
## predicted class=1 expected loss=0 P(node) =0.009345794
## class counts: 0 10
## probabilities: 0.000 1.000
##
## Node number 54: 9 observations, complexity param=0.001126126
## predicted class=1 expected loss=0.1111111 P(node) =0.008411215
## class counts: 1 8
## probabilities: 0.111 0.889
## left son=108 (1 obs) right son=109 (8 obs)
## Primary splits:
## charges < 30828.06 to the right, improve=1.7777780, (0 missing)
## age < 44.5 to the left, improve=0.7777778, (0 missing)
## bmi < 28.2625 to the right, improve=0.7777778, (0 missing)
## children < 2.5 to the right, improve=0.7777778, (0 missing)
## sex < 0.5 to the right, improve=0.1777778, (0 missing)
##
## Node number 55: 71 observations
## predicted class=1 expected loss=0 P(node) =0.06635514
## class counts: 0 71
## probabilities: 0.000 1.000
##
## Node number 66: 9 observations
## predicted class=0 expected loss=0 P(node) =0.008411215
## class counts: 9 0
## probabilities: 1.000 0.000
##
## Node number 67: 1 observations
## predicted class=1 expected loss=0 P(node) =0.0009345794
## class counts: 0 1
## probabilities: 0.000 1.000
##
## Node number 70: 2 observations
## predicted class=0 expected loss=0 P(node) =0.001869159
## class counts: 2 0
## probabilities: 1.000 0.000
##
## Node number 71: 6 observations
## predicted class=1 expected loss=0 P(node) =0.005607477
## class counts: 0 6
## probabilities: 0.000 1.000
##
## Node number 96: 99 observations
## predicted class=0 expected loss=0 P(node) =0.09252336
## class counts: 99 0
## probabilities: 1.000 0.000
##
## Node number 97: 70 observations, complexity param=0.01276276
## predicted class=0 expected loss=0.2428571 P(node) =0.06542056
## class counts: 53 17
## probabilities: 0.757 0.243
## left son=194 (41 obs) right son=195 (29 obs)
## Primary splits:
## bmi < 31.1275 to the left, improve=11.6738900, (0 missing)
## age < 32.5 to the right, improve= 1.4814940, (0 missing)
## charges < 4194.078 to the right, improve= 1.2000940, (0 missing)
## region < 1.5 to the right, improve= 1.0296530, (0 missing)
## sex < 0.5 to the right, improve= 0.3474323, (0 missing)
## Surrogate splits:
## charges < 4194.078 to the right, agree=0.671, adj=0.207, (0 split)
## age < 20.5 to the right, agree=0.629, adj=0.103, (0 split)
##
## Node number 98: 9 observations, complexity param=0.006756757
## predicted class=0 expected loss=0.3333333 P(node) =0.008411215
## class counts: 6 3
## probabilities: 0.667 0.333
## left son=196 (6 obs) right son=197 (3 obs)
## Primary splits:
## bmi < 45.06 to the left, improve=4.00, (0 missing)
## age < 28.5 to the right, improve=1.00, (0 missing)
## sex < 0.5 to the left, improve=1.00, (0 missing)
## charges < 3979.797 to the right, improve=1.00, (0 missing)
## children < 4 to the right, improve=0.25, (0 missing)
## Surrogate splits:
## charges < 4720.013 to the right, agree=0.778, adj=0.333, (0 split)
##
## Node number 99: 9 observations
## predicted class=1 expected loss=0 P(node) =0.008411215
## class counts: 0 9
## probabilities: 0.000 1.000
##
## Node number 100: 41 observations, complexity param=0.002252252
## predicted class=0 expected loss=0.02439024 P(node) =0.03831776
## class counts: 40 1
## probabilities: 0.976 0.024
## left son=200 (40 obs) right son=201 (1 obs)
## Primary splits:
## bmi < 45.725 to the left, improve=1.95122000, (0 missing)
## children < 4.5 to the left, improve=1.95122000, (0 missing)
## charges < 12543.91 to the left, improve=0.06886657, (0 missing)
## age < 52.5 to the right, improve=0.05648267, (0 missing)
## region < 1.5 to the left, improve=0.03817603, (0 missing)
##
## Node number 101: 15 observations
## predicted class=1 expected loss=0 P(node) =0.01401869
## class counts: 0 15
## probabilities: 0.000 1.000
##
## Node number 102: 78 observations, complexity param=0.02027027
## predicted class=0 expected loss=0.4230769 P(node) =0.0728972
## class counts: 45 33
## probabilities: 0.577 0.423
## left son=204 (31 obs) right son=205 (47 obs)
## Primary splits:
## bmi < 29.34 to the left, improve=2.8017000, (0 missing)
## charges < 7347.962 to the left, improve=2.3945700, (0 missing)
## age < 43.5 to the left, improve=0.9086691, (0 missing)
## region < 0.5 to the right, improve=0.6350258, (0 missing)
## sex < 0.5 to the left, improve=0.2263796, (0 missing)
## Surrogate splits:
## charges < 7205.138 to the left, agree=0.654, adj=0.129, (0 split)
## age < 42.5 to the left, agree=0.615, adj=0.032, (0 split)
##
## Node number 103: 57 observations, complexity param=0.002252252
## predicted class=1 expected loss=0.05263158 P(node) =0.05327103
## class counts: 3 54
## probabilities: 0.053 0.947
## left son=206 (8 obs) right son=207 (49 obs)
## Primary splits:
## bmi < 39.9875 to the right, improve=0.7250269, (0 missing)
## children < 1.5 to the right, improve=0.4668192, (0 missing)
## charges < 8651.546 to the right, improve=0.1706970, (0 missing)
## age < 45.5 to the right, improve=0.1342105, (0 missing)
## region < 1.5 to the left, improve=0.1207185, (0 missing)
##
## Node number 108: 1 observations
## predicted class=0 expected loss=0 P(node) =0.0009345794
## class counts: 1 0
## probabilities: 1.000 0.000
##
## Node number 109: 8 observations
## predicted class=1 expected loss=0 P(node) =0.007476636
## class counts: 0 8
## probabilities: 0.000 1.000
##
## Node number 194: 41 observations
## predicted class=0 expected loss=0 P(node) =0.03831776
## class counts: 41 0
## probabilities: 1.000 0.000
##
## Node number 195: 29 observations, complexity param=0.01276276
## predicted class=1 expected loss=0.4137931 P(node) =0.0271028
## class counts: 12 17
## probabilities: 0.414 0.586
## left son=390 (12 obs) right son=391 (17 obs)
## Primary splits:
## bmi < 35 to the right, improve=14.0689700, (0 missing)
## region < 1.5 to the right, improve= 2.5003380, (0 missing)
## age < 18.5 to the left, improve= 1.4763730, (0 missing)
## charges < 18933.33 to the right, improve= 1.4763730, (0 missing)
## sex < 0.5 to the right, improve= 0.8880131, (0 missing)
## Surrogate splits:
## region < 1.5 to the right, agree=0.690, adj=0.250, (0 split)
## age < 18.5 to the left, agree=0.655, adj=0.167, (0 split)
## charges < 18933.33 to the right, agree=0.655, adj=0.167, (0 split)
## sex < 0.5 to the right, agree=0.621, adj=0.083, (0 split)
##
## Node number 196: 6 observations
## predicted class=0 expected loss=0 P(node) =0.005607477
## class counts: 6 0
## probabilities: 1.000 0.000
##
## Node number 197: 3 observations
## predicted class=1 expected loss=0 P(node) =0.002803738
## class counts: 0 3
## probabilities: 0.000 1.000
##
## Node number 200: 40 observations
## predicted class=0 expected loss=0 P(node) =0.03738318
## class counts: 40 0
## probabilities: 1.000 0.000
##
## Node number 201: 1 observations
## predicted class=1 expected loss=0 P(node) =0.0009345794
## class counts: 0 1
## probabilities: 0.000 1.000
##
## Node number 204: 31 observations, complexity param=0.01801802
## predicted class=0 expected loss=0.2580645 P(node) =0.02897196
## class counts: 23 8
## probabilities: 0.742 0.258
## left son=408 (23 obs) right son=409 (8 obs)
## Primary splits:
## children < 1.5 to the left, improve=11.8709700, (0 missing)
## bmi < 26.8875 to the right, improve= 2.4843010, (0 missing)
## charges < 7676.924 to the left, improve= 1.2043010, (0 missing)
## age < 47.5 to the right, improve= 0.9043011, (0 missing)
## region < 2.5 to the left, improve= 0.2218449, (0 missing)
## Surrogate splits:
## bmi < 26.2675 to the right, agree=0.806, adj=0.25, (0 split)
##
## Node number 205: 47 observations, complexity param=0.02027027
## predicted class=1 expected loss=0.4680851 P(node) =0.04392523
## class counts: 22 25
## probabilities: 0.468 0.532
## left son=410 (23 obs) right son=411 (24 obs)
## Primary splits:
## children < 1.5 to the right, improve=11.5455600, (0 missing)
## bmi < 32.865 to the right, improve= 1.2328270, (0 missing)
## charges < 11881.66 to the right, improve= 0.8133462, (0 missing)
## region < 1.5 to the left, improve= 0.5764165, (0 missing)
## age < 43.5 to the left, improve= 0.4747681, (0 missing)
## Surrogate splits:
## charges < 11880.23 to the right, agree=0.681, adj=0.348, (0 split)
## age < 45.5 to the left, agree=0.596, adj=0.174, (0 split)
## bmi < 29.7825 to the left, agree=0.574, adj=0.130, (0 split)
## region < 0.5 to the right, agree=0.574, adj=0.130, (0 split)
##
## Node number 206: 8 observations, complexity param=0.002252252
## predicted class=1 expected loss=0.25 P(node) =0.007476636
## class counts: 2 6
## probabilities: 0.250 0.750
## left son=412 (2 obs) right son=413 (6 obs)
## Primary splits:
## bmi < 43.19 to the left, improve=3.0000000, (0 missing)
## children < 1.5 to the right, improve=1.6666670, (0 missing)
## age < 49.5 to the left, improve=0.6000000, (0 missing)
## region < 1.5 to the left, improve=0.3333333, (0 missing)
## charges < 8651.546 to the right, improve=0.3333333, (0 missing)
## Surrogate splits:
## children < 1.5 to the right, agree=0.875, adj=0.5, (0 split)
##
## Node number 207: 49 observations, complexity param=0.001126126
## predicted class=1 expected loss=0.02040816 P(node) =0.04579439
## class counts: 1 48
## probabilities: 0.020 0.980
## left son=414 (5 obs) right son=415 (44 obs)
## Primary splits:
## bmi < 34.3075 to the left, improve=0.35918370, (0 missing)
## age < 56.5 to the right, improve=0.18140590, (0 missing)
## region < 0.5 to the left, improve=0.18140590, (0 missing)
## charges < 12935.13 to the right, improve=0.11302980, (0 missing)
## children < 1.5 to the right, improve=0.05918367, (0 missing)
##
## Node number 390: 12 observations
## predicted class=0 expected loss=0 P(node) =0.01121495
## class counts: 12 0
## probabilities: 1.000 0.000
##
## Node number 391: 17 observations
## predicted class=1 expected loss=0 P(node) =0.01588785
## class counts: 0 17
## probabilities: 0.000 1.000
##
## Node number 408: 23 observations
## predicted class=0 expected loss=0 P(node) =0.02149533
## class counts: 23 0
## probabilities: 1.000 0.000
##
## Node number 409: 8 observations
## predicted class=1 expected loss=0 P(node) =0.007476636
## class counts: 0 8
## probabilities: 0.000 1.000
##
## Node number 410: 23 observations, complexity param=0.009009009
## predicted class=0 expected loss=0.173913 P(node) =0.02149533
## class counts: 19 4
## probabilities: 0.826 0.174
## left son=820 (19 obs) right son=821 (4 obs)
## Primary splits:
## bmi < 29.9375 to the right, improve=6.60869600, (0 missing)
## charges < 30647.57 to the left, improve=1.42687700, (0 missing)
## region < 1.5 to the left, improve=0.56254180, (0 missing)
## age < 44.5 to the right, improve=0.25155280, (0 missing)
## sex < 0.5 to the left, improve=0.02408027, (0 missing)
##
## Node number 411: 24 observations, complexity param=0.006756757
## predicted class=1 expected loss=0.125 P(node) =0.02242991
## class counts: 3 21
## probabilities: 0.125 0.875
## left son=822 (3 obs) right son=823 (21 obs)
## Primary splits:
## bmi < 29.965 to the left, improve=5.2500000, (0 missing)
## charges < 9289.083 to the left, improve=0.7500000, (0 missing)
## age < 49.5 to the left, improve=0.5357143, (0 missing)
## sex < 0.5 to the left, improve=0.0472028, (0 missing)
## region < 1.5 to the left, improve=0.0472028, (0 missing)
##
## Node number 412: 2 observations
## predicted class=0 expected loss=0 P(node) =0.001869159
## class counts: 2 0
## probabilities: 1.000 0.000
##
## Node number 413: 6 observations
## predicted class=1 expected loss=0 P(node) =0.005607477
## class counts: 0 6
## probabilities: 0.000 1.000
##
## Node number 414: 5 observations, complexity param=0.001126126
## predicted class=1 expected loss=0.2 P(node) =0.004672897
## class counts: 1 4
## probabilities: 0.200 0.800
## left son=828 (1 obs) right son=829 (4 obs)
## Primary splits:
## age < 55.5 to the right, improve=1.6, (0 missing)
## bmi < 34.2525 to the right, improve=1.6, (0 missing)
## children < 1.5 to the right, improve=1.6, (0 missing)
## charges < 12024.66 to the right, improve=1.6, (0 missing)
## sex < 0.5 to the left, improve=0.6, (0 missing)
##
## Node number 415: 44 observations
## predicted class=1 expected loss=0 P(node) =0.0411215
## class counts: 0 44
## probabilities: 0.000 1.000
##
## Node number 820: 19 observations
## predicted class=0 expected loss=0 P(node) =0.01775701
## class counts: 19 0
## probabilities: 1.000 0.000
##
## Node number 821: 4 observations
## predicted class=1 expected loss=0 P(node) =0.003738318
## class counts: 0 4
## probabilities: 0.000 1.000
##
## Node number 822: 3 observations
## predicted class=0 expected loss=0 P(node) =0.002803738
## class counts: 3 0
## probabilities: 1.000 0.000
##
## Node number 823: 21 observations
## predicted class=1 expected loss=0 P(node) =0.01962617
## class counts: 0 21
## probabilities: 0.000 1.000
##
## Node number 828: 1 observations
## predicted class=0 expected loss=0 P(node) =0.0009345794
## class counts: 1 0
## probabilities: 1.000 0.000
##
## Node number 829: 4 observations
## predicted class=1 expected loss=0 P(node) =0.003738318
## class counts: 0 4
## probabilities: 0.000 1.000
The tree is split into nodes, with each node representing a decision point based on a particular variable. The tree starts with the root node (Node number 1) and branches down to subsequent nodes.
The complexity parameters (CP) help control the size of the tree to prevent overfitting. Smaller CP values result in larger trees.
The variable importance section indicates the importance of each predictor variable in making predictions. In this case, it looks like “bmi” (body mass index) is the most important variable, followed by “children,” “charges,” “smoker,” “age,” and “region.”
Each node in the tree is associated with specific information: - Node Number: Identifies the node. - Predicted Class: The predicted class (0 or 1) at that node. - Expected Loss: The expected misclassification rate at that node. - P(node): The proportion of observations in that node.
The “Primary Splits” indicate the conditions for splitting nodes. For example, “bmi < 25.9825” means that if a certain observation has a BMI less than 25.9825, it follows the left branch; otherwise, it follows the right branch.
Surrogate splits are alternative rules used when primary splits cannot be applied. They provide a backup in case the primary split is not informative.
For Node 1: - Predicted class: 1 (insurance claim) - Expected loss: 0.415 - The node is split based on various conditions, with “bmi” being the most significant.
For Node 2: - Predicted class: 0 (no insurance claim) - Expected loss: 0.213 - The node is split based on “smoker,” “charges,” “bmi,” “age,” and “region.”
For Node 3: - Predicted class: 1 (insurance claim) - Expected loss: 0.305 - The node is split based on “children,” “smoker,” “charges,” “age,” and “bmi.”
The root node error is the percent of incorrectly classified cases at the first (root) splitting node. That is,
\[ \text{Root Node Error} = \frac{\text{No. of observations misclassified at the root (the minority class)}}{\text{Size of the training dataset}} \] Since the root predicts the majority class (1), the root node error at the first step of the tree construction is the proportion of class-0 observations, \(444/1070 \approx 0.415\).
# Root node error = proportion of observations misclassified at the root,
# i.e. the minority-class share (the root predicts the majority class).
# Here that is the 0's: 444/1070 = 0.41495, matching printcp()'s
# "Root node error" below.
# NOTE(review): the original computed sum(insuranceclaim == 1)/nrow(...),
# which is the MAJORITY-class share (0.585), not the root node error.
(rootnode_err <- 1 - max(table(insurance.data.train$insuranceclaim)) / nrow(insurance.data.train))
## [1] 0.5850467
# CP table for the full tree: rel error (training) and xerror (cross-validated)
# at each candidate complexity parameter.
printcp(fit.allp)
##
## Classification tree:
## rpart(formula = insuranceclaim ~ ., data = insurance.data.train,
## method = "class", control = rpart.control(minsplit = 1, cp = 0.001))
##
## Variables actually used in tree construction:
## [1] age bmi charges children smoker
##
## Root node error: 444/1070 = 0.41495
##
## n= 1070
##
## CP nsplit rel error xerror xstd
## 1 0.3153153 0 1.0000000 1.000000 0.036300
## 2 0.1283784 1 0.6846847 0.718468 0.033701
## 3 0.0518018 3 0.4279279 0.436937 0.028384
## 4 0.0439189 4 0.3761261 0.427928 0.028154
## 5 0.0337838 6 0.2882883 0.324324 0.025143
## 6 0.0270270 7 0.2545045 0.299550 0.024306
## 7 0.0202703 8 0.2274775 0.256757 0.022730
## 8 0.0180180 10 0.1869369 0.236486 0.021917
## 9 0.0168919 11 0.1689189 0.195946 0.020135
## 10 0.0157658 13 0.1351351 0.195946 0.020135
## 11 0.0135135 14 0.1193694 0.171171 0.018925
## 12 0.0127628 15 0.1058559 0.168919 0.018809
## 13 0.0090090 18 0.0675676 0.105856 0.015098
## 14 0.0067568 19 0.0585586 0.103604 0.014944
## 15 0.0056306 22 0.0382883 0.083333 0.013461
## 16 0.0045045 24 0.0270270 0.083333 0.013461
## 17 0.0022523 27 0.0135135 0.056306 0.011129
## 18 0.0011261 30 0.0067568 0.054054 0.010909
## 19 0.0010000 36 0.0000000 0.051802 0.010685
The root node error is the misclassification rate at the beginning of tree construction. In this case, the error is 41.5%, indicating that 41.5% of instances are misclassified based on the initial split. CP is a tuning parameter controlling the trade-off between tree complexity and goodness of fit. The initial CP is 0.3153153, indicating a simple one-split tree; it decreases down the table as the tree grows and fits the training data more closely, and the relative error and cross-validated error (xerror) decrease along with it. We can observe the number of splits ranging from 0 to 36.
# Largest number of splits recorded in the CP table (the fully grown tree).
max(fit.allp$cptable[,"nsplit"])
## [1] 36
# Smallest: 0 splits, i.e. the root-only tree.
min(fit.allp$cptable[,"nsplit"])
## [1] 0
From the complexity table (cptable) associated with the fitted tree model, the most complex version of the tree uses 36 splits (decision points). A minimum value of 0 corresponds to a model with no splits — a single-node tree consisting of just the root.
Following is the plot of X-val relative error versus cp and tree size, where we can observe the curve descending from 1.0 toward 0.0, indicating that the cross-validated error (estimated via K-fold cross-validation) shrinks as the tree grows.
# Plot cross-validated relative error against CP and tree size.
plotcp(fit.allp)
# CP value that minimises the cross-validated error (xerror).
(cp= fit.allp$cptable[which.min(fit.allp$cptable[, "xerror"]), "CP"])
## [1] 0.001
# The corresponding minimum cross-validated error.
(xerr = fit.allp$cptable[which.min(fit.allp$cptable[, "xerror"]), "xerror"])
## [1] 0.0518018
The values suggest that the tree model with a complexity parameter of 0.001 has the minimum cross-validated error of approximately 0.0518. This model strikes a balance between complexity and accuracy, making it a reasonable choice based on cross-validated performance.
The following plot shows the split at each level of the tree, with the predicted response (0/1), the node error, and the percentage of the data reaching that node. The splitting condition is specified at each split.
# Visualise the full (unpruned) CART fit: each node shows the predicted class,
# class proportions, and the percentage of observations reaching it.
rpart.plot(fit.allp, extra = "auto", main = "Fitted tree using CART for the Insurance data")
# Score the held-out test set with the full CART fit and cross-tabulate
# predicted vs. actual claim status.
test_df <- data.frame(
  actual = insurance.data.test$insuranceclaim,
  pred = predict(fit.allp, newdata = insurance.data.test, type = "class")
)
(conf_matrix_base <- table(test_df$pred, test_df$actual)) #confusion matrix
##
## 0 1
## 0 108 3
## 1 3 154
The model correctly identified 154 cases where insurance claims were made (True Positives) and accurately predicted 108 instances of no claims (True Negatives). However, it made a small number of errors, with 3 instances of falsely predicting a claim (False Positives) and 3 instances of failing to predict an actual claim (False Negatives).
# NOTE(review): caret::sensitivity() on a table takes the FIRST level ("0",
# no claim) as the positive class, so this value is the rate at which actual
# no-claim cases are correctly identified — confirm against the prose below.
sensitivity(conf_matrix_base)
## [1] 0.972973
# Conversely, specificity() here measures how well class "1" (claims) is found.
specificity(conf_matrix_base)
## [1] 0.9808917
# Misclassification rate = off-diagonal count / total observations.
# NOTE(review): the original parenthesization assigned only the raw error
# COUNT to mis.rate and divided afterwards, so the printed value was the rate
# but mis.rate stored the count. Parenthesize so mis.rate holds the rate.
(mis.rate <- (conf_matrix_base[1, 2] +
  conf_matrix_base[2, 1]) / sum(conf_matrix_base))
## [1] 0.02238806
Note that caret's sensitivity()/specificity() treat the first factor level ("0", no claim) as the positive class here. The reported sensitivity of approximately 97.30% is therefore the proportion of actual no-claim cases correctly identified.
The specificity of approximately 98.09% is correspondingly the proportion of actual insurance claims correctly identified.
The overall misclassification rate, representing the proportion of incorrect predictions out of the total predictions, is approximately 2.24%.
The code below fits the model with the cp parameter set to 0.0001, and the results look similar to those obtained with cp = 0.001.
#Hyper Parameter Tuning
# Refit the tree with a much smaller complexity parameter (cp = 0.0001).
fit.allf <- rpart(insuranceclaim ~., method = "class", data = insurance.data.train,
                  control = rpart.control(cp = 0.0001))
# NOTE(review): the original plotted and predicted with fit.allp, so the
# "similar results" were literally the same model's output. Evaluate the
# newly tuned fit.allf instead.
plotcp(fit.allf)
test_df <- data.frame(actual = insurance.data.test$insuranceclaim, pred = NA)
test_df$pred <- predict(fit.allf, newdata = insurance.data.test, type = "class")
(conf_matrix_base <- table(test_df$pred, test_df$actual)) #confusion matrix
##
## 0 1
## 0 108 3
## 1 3 154
# Class-wise rates for the cp = 0.0001 evaluation. As above, caret treats the
# first level ("0") as the positive class here — confirm intent.
sensitivity(conf_matrix_base)
## [1] 0.972973
specificity(conf_matrix_base)
## [1] 0.9808917
# Misclassification rate = off-diagonal count / total observations.
# NOTE(review): parenthesization fixed so mis.rate stores the rate itself,
# not just the error count (the original divided only the printed value).
(mis.rate <- (conf_matrix_base[1, 2] +
  conf_matrix_base[2, 1]) / sum(conf_matrix_base))
## [1] 0.02238806
# Resubstitution check: predict back on the TRAINING data.
# NOTE(review): this uses fit.allp (grown with minsplit = 1), which memorises
# the training set — hence the perfect confusion matrix below. If the intent
# was to assess the cp = 0.0001 refit, fit.allf should be used — confirm.
test_df <- data.frame(actual = insurance.data.train$insuranceclaim, pred = NA)
test_df$pred <- predict(fit.allp, newdata = insurance.data.train, type = "class")
(tab <- table(test_df$pred, test_df$actual)) #confusion matrix
##
## 0 1
## 0 444 0
## 1 0 626
# Overall training accuracy: correct predictions (diagonal) over total.
sum(diag(tab))/sum(tab)
## [1] 1
# Per-class rates are likewise perfect on the training data (overfitting).
sensitivity(tab)
## [1] 1
specificity(tab)
## [1] 1
The code below fits the model with the cp parameter set to 0.1, and the results look similar to those obtained with cp = 0.001 and 0.0001.
# Refit the tree with a large complexity parameter (cp = 0.1), which forces
# a much smaller tree.
fit.allf <- rpart(insuranceclaim ~., method = "class", data = insurance.data.train,
                  control = rpart.control(cp = 0.1))
# NOTE(review): the original plotted and predicted with fit.allp, so the
# results shown were those of the old model, not this cp = 0.1 fit.
plotcp(fit.allf)
test_df <- data.frame(actual = insurance.data.test$insuranceclaim, pred = NA)
test_df$pred <- predict(fit.allf, newdata = insurance.data.test, type = "class")
(conf_matrix_base <- table(test_df$pred, test_df$actual)) #confusion matrix
##
## 0 1
## 0 108 3
## 1 3 154
# Class-wise rates for the cp = 0.1 evaluation (first level "0" is treated
# as the positive class by caret here — confirm intent).
sensitivity(conf_matrix_base)
## [1] 0.972973
specificity(conf_matrix_base)
## [1] 0.9808917
# Misclassification rate = off-diagonal count / total observations.
# NOTE(review): parenthesization fixed so mis.rate stores the rate itself,
# not just the error count (the original divided only the printed value).
(mis.rate <- (conf_matrix_base[1, 2] +
  conf_matrix_base[2, 1]) / sum(conf_matrix_base))
## [1] 0.02238806
The function prune() can be used to select a subtree of the tree obtained with rpart() if we think (by looking at the xerror estimates) that we would fit the data better by pruning.
#Prune the tree
# Prune the full tree back to the subtree whose CP minimises the
# cross-validated error (xerror), then plot the pruned tree.
best_cp <- fit.allp$cptable[which.min(fit.allp$cptable[, "xerror"]), "CP"]
pfit.allp <- prune(fit.allp, cp = best_cp)
rpart.plot(pfit.allp, extra = "auto", main = "Pruned Decision Tree")
# Full rpart summary of the pruned tree: CP table, variable importance,
# and per-node split details.
summary(pfit.allp)
## Call:
## rpart(formula = insuranceclaim ~ ., data = insurance.data.train,
## method = "class", control = rpart.control(minsplit = 1, cp = 0.001))
## n= 1070
##
## CP nsplit rel error xerror xstd
## 1 0.315315315 0 1.000000000 1.00000000 0.03629976
## 2 0.128378378 1 0.684684685 0.71846847 0.03370082
## 3 0.051801802 3 0.427927928 0.43693694 0.02838429
## 4 0.043918919 4 0.376126126 0.42792793 0.02815421
## 5 0.033783784 6 0.288288288 0.32432432 0.02514270
## 6 0.027027027 7 0.254504505 0.29954955 0.02430641
## 7 0.020270270 8 0.227477477 0.25675676 0.02273037
## 8 0.018018018 10 0.186936937 0.23648649 0.02191712
## 9 0.016891892 11 0.168918919 0.19594595 0.02013546
## 10 0.015765766 13 0.135135135 0.19594595 0.02013546
## 11 0.013513514 14 0.119369369 0.17117117 0.01892453
## 12 0.012762763 15 0.105855856 0.16891892 0.01880907
## 13 0.009009009 18 0.067567568 0.10585586 0.01509774
## 14 0.006756757 19 0.058558559 0.10360360 0.01494356
## 15 0.005630631 22 0.038288288 0.08333333 0.01346096
## 16 0.004504505 24 0.027027027 0.08333333 0.01346096
## 17 0.002252252 27 0.013513514 0.05630631 0.01112893
## 18 0.001126126 30 0.006756757 0.05405405 0.01090929
## 19 0.001000000 36 0.000000000 0.05180180 0.01068470
##
## Variable importance
## bmi children charges smoker age region
## 31 26 20 12 10 1
##
## Node number 1: 1070 observations, complexity param=0.3153153
## predicted class=1 expected loss=0.4149533 P(node) =1
## class counts: 444 626
## probabilities: 0.415 0.585
## left son=2 (244 obs) right son=3 (826 obs)
## Primary splits:
## bmi < 25.9825 to the left, improve=87.44814, (0 missing)
## children < 0.5 to the right, improve=82.55445, (0 missing)
## smoker < 0.5 to the left, improve=57.01775, (0 missing)
## charges < 33047.5 to the left, improve=45.12917, (0 missing)
## age < 41.5 to the left, improve=16.60421, (0 missing)
##
## Node number 2: 244 observations, complexity param=0.0518018
## predicted class=0 expected loss=0.2131148 P(node) =0.2280374
## class counts: 192 52
## probabilities: 0.787 0.213
## left son=4 (191 obs) right son=5 (53 obs)
## Primary splits:
## smoker < 0.5 to the left, improve=34.378990, (0 missing)
## charges < 14511.86 to the left, improve=26.570560, (0 missing)
## bmi < 17.575 to the right, improve= 7.247083, (0 missing)
## age < 63.5 to the left, improve= 2.344399, (0 missing)
## region < 2.5 to the right, improve= 0.787562, (0 missing)
## Surrogate splits:
## charges < 14511.86 to the left, agree=0.939, adj=0.717, (0 split)
## bmi < 25.845 to the left, agree=0.787, adj=0.019, (0 split)
##
## Node number 3: 826 observations, complexity param=0.1283784
## predicted class=1 expected loss=0.3050847 P(node) =0.7719626
## class counts: 252 574
## probabilities: 0.305 0.695
## left son=6 (473 obs) right son=7 (353 obs)
## Primary splits:
## children < 0.5 to the right, improve=114.753100, (0 missing)
## smoker < 0.5 to the left, improve= 30.605070, (0 missing)
## charges < 33047.5 to the left, improve= 24.727040, (0 missing)
## age < 41.5 to the left, improve= 13.870640, (0 missing)
## bmi < 31.01 to the left, improve= 9.525539, (0 missing)
## Surrogate splits:
## charges < 3220.372 to the right, agree=0.662, adj=0.210, (0 split)
## age < 25.5 to the right, agree=0.646, adj=0.173, (0 split)
## bmi < 26.6475 to the right, agree=0.579, adj=0.014, (0 split)
##
## Node number 4: 191 observations, complexity param=0.01576577
## predicted class=0 expected loss=0.07329843 P(node) =0.1785047
## class counts: 177 14
## probabilities: 0.927 0.073
## left son=8 (184 obs) right son=9 (7 obs)
## Primary splits:
## bmi < 17.575 to the right, improve=12.4802500, (0 missing)
## charges < 30225.63 to the left, improve= 1.7265910, (0 missing)
## children < 1.5 to the left, improve= 0.8043285, (0 missing)
## age < 63.5 to the left, improve= 0.7360038, (0 missing)
## region < 0.5 to the right, improve= 0.1247432, (0 missing)
##
## Node number 5: 53 observations, complexity param=0.01689189
## predicted class=1 expected loss=0.2830189 P(node) =0.04953271
## class counts: 15 38
## probabilities: 0.283 0.717
## left son=10 (23 obs) right son=11 (30 obs)
## Primary splits:
## children < 1.5 to the right, improve=11.074650, (0 missing)
## age < 41 to the left, improve= 6.509434, (0 missing)
## charges < 19479.9 to the left, improve= 4.805730, (0 missing)
## bmi < 19.1975 to the left, improve= 4.448209, (0 missing)
## region < 2.5 to the right, improve= 1.610444, (0 missing)
## Surrogate splits:
## bmi < 24.265 to the right, agree=0.660, adj=0.217, (0 split)
## region < 2.5 to the right, agree=0.623, adj=0.130, (0 split)
## charges < 16717.01 to the right, agree=0.604, adj=0.087, (0 split)
## age < 29.5 to the right, agree=0.585, adj=0.043, (0 split)
##
## Node number 6: 473 observations, complexity param=0.1283784
## predicted class=0 expected loss=0.4672304 P(node) =0.4420561
## class counts: 252 221
## probabilities: 0.533 0.467
## left son=12 (378 obs) right son=13 (95 obs)
## Primary splits:
## smoker < 0.5 to the left, improve=52.43251, (0 missing)
## charges < 30124.26 to the left, improve=45.66116, (0 missing)
## age < 40.5 to the left, improve=26.26169, (0 missing)
## bmi < 31.1925 to the left, improve=15.70518, (0 missing)
## children < 2.5 to the right, improve=14.09699, (0 missing)
## Surrogate splits:
## charges < 30124.26 to the left, agree=0.941, adj=0.705, (0 split)
## bmi < 26.1525 to the right, agree=0.801, adj=0.011, (0 split)
##
## Node number 7: 353 observations
## predicted class=1 expected loss=0 P(node) =0.3299065
## class counts: 0 353
## probabilities: 0.000 1.000
##
## Node number 8: 184 observations, complexity param=0.004504505
## predicted class=0 expected loss=0.03804348 P(node) =0.1719626
## class counts: 177 7
## probabilities: 0.962 0.038
## left son=16 (157 obs) right son=17 (27 obs)
## Primary splits:
## bmi < 25.3325 to the left, improve=2.14679700, (0 missing)
## charges < 30225.63 to the left, improve=1.86083400, (0 missing)
## children < 1.5 to the left, improve=1.28220600, (0 missing)
## age < 63.5 to the left, improve=0.86299570, (0 missing)
## region < 2.5 to the left, improve=0.08394667, (0 missing)
## Surrogate splits:
## charges < 28149.52 to the left, agree=0.859, adj=0.037, (0 split)
##
## Node number 9: 7 observations
## predicted class=1 expected loss=0 P(node) =0.006542056
## class counts: 0 7
## probabilities: 0.000 1.000
##
## Node number 10: 23 observations, complexity param=0.01689189
## predicted class=0 expected loss=0.3478261 P(node) =0.02149533
## class counts: 15 8
## probabilities: 0.652 0.348
## left son=20 (15 obs) right son=21 (8 obs)
## Primary splits:
## age < 41.5 to the left, improve=10.4347800, (0 missing)
## charges < 19621.16 to the left, improve= 7.2347830, (0 missing)
## bmi < 24.56 to the left, improve= 2.2501670, (0 missing)
## children < 2.5 to the right, improve= 0.7732441, (0 missing)
## region < 2.5 to the right, improve= 0.5328218, (0 missing)
## Surrogate splits:
## charges < 19621.16 to the left, agree=0.913, adj=0.75, (0 split)
## bmi < 24.56 to the left, agree=0.739, adj=0.25, (0 split)
##
## Node number 11: 30 observations
## predicted class=1 expected loss=0 P(node) =0.02803738
## class counts: 0 30
## probabilities: 0.000 1.000
##
## Node number 12: 378 observations, complexity param=0.04391892
## predicted class=0 expected loss=0.3492063 P(node) =0.353271
## class counts: 246 132
## probabilities: 0.651 0.349
## left son=24 (187 obs) right son=25 (191 obs)
## Primary splits:
## age < 40.5 to the left, improve=27.8931800, (0 missing)
## charges < 7146.168 to the left, improve=17.2151500, (0 missing)
## children < 1.5 to the right, improve=13.6844100, (0 missing)
## bmi < 31.1925 to the left, improve=12.0039700, (0 missing)
## region < 0.5 to the right, improve= 0.5812166, (0 missing)
## Surrogate splits:
## charges < 6659.237 to the left, agree=0.897, adj=0.791, (0 split)
## bmi < 31.3025 to the left, agree=0.566, adj=0.123, (0 split)
## sex < 0.5 to the right, agree=0.534, adj=0.059, (0 split)
## region < 0.5 to the right, agree=0.526, adj=0.043, (0 split)
## children < 3.5 to the right, agree=0.516, adj=0.021, (0 split)
##
## Node number 13: 95 observations, complexity param=0.005630631
## predicted class=1 expected loss=0.06315789 P(node) =0.08878505
## class counts: 6 89
## probabilities: 0.063 0.937
## left son=26 (15 obs) right son=27 (80 obs)
## Primary splits:
## charges < 21902.02 to the left, improve=2.6004390, (0 missing)
## bmi < 29.765 to the left, improve=2.2421050, (0 missing)
## children < 3.5 to the right, improve=1.7740200, (0 missing)
## age < 35.5 to the left, improve=0.8363513, (0 missing)
## region < 1.5 to the left, improve=0.1132164, (0 missing)
## Surrogate splits:
## bmi < 29.765 to the left, agree=0.905, adj=0.400, (0 split)
## age < 21 to the left, agree=0.863, adj=0.133, (0 split)
##
## Node number 16: 157 observations, complexity param=0.001126126
## predicted class=0 expected loss=0.006369427 P(node) =0.146729
## class counts: 156 1
## probabilities: 0.994 0.006
## left son=32 (147 obs) right son=33 (10 obs)
## Primary splits:
## bmi < 25.1275 to the left, improve=0.18726110, (0 missing)
## children < 1.5 to the left, improve=0.03073941, (0 missing)
## region < 0.5 to the right, improve=0.02981434, (0 missing)
## charges < 9030.432 to the left, improve=0.02362478, (0 missing)
## age < 44.5 to the left, improve=0.02004803, (0 missing)
##
## Node number 17: 27 observations, complexity param=0.004504505
## predicted class=0 expected loss=0.2222222 P(node) =0.02523364
## class counts: 21 6
## probabilities: 0.778 0.222
## left son=34 (19 obs) right son=35 (8 obs)
## Primary splits:
## children < 1.5 to the left, improve=6.3333330, (0 missing)
## charges < 9016.941 to the left, improve=2.4761900, (0 missing)
## age < 44.5 to the left, improve=1.8333330, (0 missing)
## bmi < 25.3825 to the right, improve=1.3333330, (0 missing)
## region < 1.5 to the left, improve=0.5333333, (0 missing)
## Surrogate splits:
## bmi < 25.3825 to the right, agree=0.741, adj=0.125, (0 split)
## charges < 12385.87 to the left, agree=0.741, adj=0.125, (0 split)
##
## Node number 20: 15 observations
## predicted class=0 expected loss=0 P(node) =0.01401869
## class counts: 15 0
## probabilities: 1.000 0.000
##
## Node number 21: 8 observations
## predicted class=1 expected loss=0 P(node) =0.007476636
## class counts: 0 8
## probabilities: 0.000 1.000
##
## Node number 24: 187 observations, complexity param=0.01351351
## predicted class=0 expected loss=0.1550802 P(node) =0.1747664
## class counts: 158 29
## probabilities: 0.845 0.155
## left son=48 (169 obs) right son=49 (18 obs)
## Primary splits:
## bmi < 40.1375 to the left, improve=10.42547000, (0 missing)
## children < 1.5 to the right, improve= 8.28593800, (0 missing)
## charges < 4170.071 to the right, improve= 3.18811300, (0 missing)
## age < 27.5 to the right, improve= 0.66374980, (0 missing)
## region < 2.5 to the left, improve= 0.04327544, (0 missing)
##
## Node number 25: 191 observations, complexity param=0.04391892
## predicted class=1 expected loss=0.460733 P(node) =0.1785047
## class counts: 88 103
## probabilities: 0.461 0.539
## left son=50 (56 obs) right son=51 (135 obs)
## Primary splits:
## children < 2.5 to the right, improve=10.187190, (0 missing)
## bmi < 34.49 to the left, improve= 7.423943, (0 missing)
## charges < 11881.66 to the right, improve= 2.994786, (0 missing)
## age < 53.5 to the right, improve= 1.934244, (0 missing)
## region < 1.5 to the left, improve= 1.254331, (0 missing)
## Surrogate splits:
## age < 62.5 to the right, agree=0.723, adj=0.054, (0 split)
##
## Node number 26: 15 observations, complexity param=0.005630631
## predicted class=1 expected loss=0.3333333 P(node) =0.01401869
## class counts: 5 10
## probabilities: 0.333 0.667
## left son=52 (5 obs) right son=53 (10 obs)
## Primary splits:
## children < 1.5 to the right, improve=6.6666670, (0 missing)
## charges < 19826.58 to the right, improve=1.4880950, (0 missing)
## bmi < 29.315 to the right, improve=0.9523810, (0 missing)
## age < 35.5 to the left, improve=0.5128205, (0 missing)
## region < 2.5 to the right, improve=0.1282051, (0 missing)
## Surrogate splits:
## bmi < 28.535 to the right, agree=0.733, adj=0.2, (0 split)
## charges < 19826.58 to the right, agree=0.733, adj=0.2, (0 split)
##
## Node number 27: 80 observations, complexity param=0.001126126
## predicted class=1 expected loss=0.0125 P(node) =0.07476636
## class counts: 1 79
## probabilities: 0.013 0.988
## left son=54 (9 obs) right son=55 (71 obs)
## Primary splits:
## bmi < 29.07 to the left, improve=0.19722220, (0 missing)
## charges < 33545.31 to the left, improve=0.14166670, (0 missing)
## children < 2.5 to the right, improve=0.07023810, (0 missing)
## age < 42.5 to the left, improve=0.03382353, (0 missing)
## region < 1.5 to the left, improve=0.02905405, (0 missing)
## Surrogate splits:
## charges < 29526.8 to the left, agree=0.975, adj=0.778, (0 split)
##
## Node number 32: 147 observations
## predicted class=0 expected loss=0 P(node) =0.1373832
## class counts: 147 0
## probabilities: 1.000 0.000
##
## Node number 33: 10 observations, complexity param=0.001126126
## predicted class=0 expected loss=0.1 P(node) =0.009345794
## class counts: 9 1
## probabilities: 0.900 0.100
## left son=66 (9 obs) right son=67 (1 obs)
## Primary splits:
## children < 1.5 to the left, improve=1.8000000, (0 missing)
## region < 0.5 to the right, improve=0.4666667, (0 missing)
## charges < 8768.868 to the left, improve=0.3000000, (0 missing)
## age < 43.5 to the left, improve=0.2000000, (0 missing)
## sex < 0.5 to the right, improve=0.2000000, (0 missing)
##
## Node number 34: 19 observations
## predicted class=0 expected loss=0 P(node) =0.01775701
## class counts: 19 0
## probabilities: 1.000 0.000
##
## Node number 35: 8 observations, complexity param=0.004504505
## predicted class=1 expected loss=0.25 P(node) =0.007476636
## class counts: 2 6
## probabilities: 0.250 0.750
## left son=70 (2 obs) right son=71 (6 obs)
## Primary splits:
## age < 38 to the left, improve=3.0000000, (0 missing)
## charges < 7018.252 to the left, improve=3.0000000, (0 missing)
## bmi < 25.7425 to the right, improve=1.6666670, (0 missing)
## children < 2.5 to the left, improve=0.6000000, (0 missing)
## region < 0.5 to the left, improve=0.3333333, (0 missing)
## Surrogate splits:
## charges < 7018.252 to the left, agree=1.000, adj=1.0, (0 split)
## bmi < 25.7425 to the right, agree=0.875, adj=0.5, (0 split)
##
## Node number 48: 169 observations, complexity param=0.01276276
## predicted class=0 expected loss=0.1005917 P(node) =0.1579439
## class counts: 152 17
## probabilities: 0.899 0.101
## left son=96 (99 obs) right son=97 (70 obs)
## Primary splits:
## children < 1.5 to the right, improve=4.8370250, (0 missing)
## bmi < 31.1925 to the left, improve=4.0863750, (0 missing)
## charges < 4170.071 to the right, improve=2.2165900, (0 missing)
## age < 32.5 to the right, improve=0.8416554, (0 missing)
## region < 1.5 to the right, improve=0.2004161, (0 missing)
## Surrogate splits:
## charges < 3393.167 to the right, agree=0.692, adj=0.257, (0 split)
## bmi < 28.4525 to the right, agree=0.598, adj=0.029, (0 split)
## age < 18.5 to the right, agree=0.592, adj=0.014, (0 split)
##
## Node number 49: 18 observations, complexity param=0.006756757
## predicted class=1 expected loss=0.3333333 P(node) =0.01682243
## class counts: 6 12
## probabilities: 0.333 0.667
## left son=98 (9 obs) right son=99 (9 obs)
## Primary splits:
## children < 1.5 to the right, improve=4.0000000, (0 missing)
## age < 28.5 to the right, improve=1.5384620, (0 missing)
## bmi < 45.06 to the left, improve=1.5384620, (0 missing)
## charges < 3979.797 to the right, improve=1.5384620, (0 missing)
## region < 0.5 to the right, improve=0.2352941, (0 missing)
## Surrogate splits:
## bmi < 42.92 to the right, agree=0.778, adj=0.556, (0 split)
## age < 26.5 to the right, agree=0.722, adj=0.444, (0 split)
## charges < 3551.876 to the right, agree=0.722, adj=0.444, (0 split)
## region < 1.5 to the right, agree=0.611, adj=0.222, (0 split)
## sex < 0.5 to the right, agree=0.556, adj=0.111, (0 split)
##
## Node number 50: 56 observations, complexity param=0.03378378
## predicted class=0 expected loss=0.2857143 P(node) =0.05233645
## class counts: 40 16
## probabilities: 0.714 0.286
## left son=100 (41 obs) right son=101 (15 obs)
## Primary splits:
## bmi < 29.9725 to the right, improve=20.9059200, (0 missing)
## age < 53.5 to the right, improve= 2.7501910, (0 missing)
## children < 4.5 to the left, improve= 2.1164020, (0 missing)
## charges < 11258.98 to the right, improve= 1.4404760, (0 missing)
## region < 0.5 to the right, improve= 0.3180124, (0 missing)
##
## Node number 51: 135 observations, complexity param=0.02702703
## predicted class=1 expected loss=0.3555556 P(node) =0.1261682
## class counts: 48 87
## probabilities: 0.356 0.644
## left son=102 (78 obs) right son=103 (57 obs)
## Primary splits:
## bmi < 33.5825 to the left, improve=18.105530, (0 missing)
## region < 1.5 to the left, improve= 2.115893, (0 missing)
## charges < 7099.057 to the left, improve= 1.866667, (0 missing)
## age < 61.5 to the left, improve= 1.312821, (0 missing)
## sex < 0.5 to the left, improve= 0.533928, (0 missing)
## Surrogate splits:
## charges < 14177.35 to the left, agree=0.622, adj=0.105, (0 split)
## age < 60.5 to the left, agree=0.600, adj=0.053, (0 split)
##
## Node number 52: 5 observations
## predicted class=0 expected loss=0 P(node) =0.004672897
## class counts: 5 0
## probabilities: 1.000 0.000
##
## Node number 53: 10 observations
## predicted class=1 expected loss=0 P(node) =0.009345794
## class counts: 0 10
## probabilities: 0.000 1.000
##
## Node number 54: 9 observations, complexity param=0.001126126
## predicted class=1 expected loss=0.1111111 P(node) =0.008411215
## class counts: 1 8
## probabilities: 0.111 0.889
## left son=108 (1 obs) right son=109 (8 obs)
## Primary splits:
## charges < 30828.06 to the right, improve=1.7777780, (0 missing)
## age < 44.5 to the left, improve=0.7777778, (0 missing)
## bmi < 28.2625 to the right, improve=0.7777778, (0 missing)
## children < 2.5 to the right, improve=0.7777778, (0 missing)
## sex < 0.5 to the right, improve=0.1777778, (0 missing)
##
## Node number 55: 71 observations
## predicted class=1 expected loss=0 P(node) =0.06635514
## class counts: 0 71
## probabilities: 0.000 1.000
##
## Node number 66: 9 observations
## predicted class=0 expected loss=0 P(node) =0.008411215
## class counts: 9 0
## probabilities: 1.000 0.000
##
## Node number 67: 1 observations
## predicted class=1 expected loss=0 P(node) =0.0009345794
## class counts: 0 1
## probabilities: 0.000 1.000
##
## Node number 70: 2 observations
## predicted class=0 expected loss=0 P(node) =0.001869159
## class counts: 2 0
## probabilities: 1.000 0.000
##
## Node number 71: 6 observations
## predicted class=1 expected loss=0 P(node) =0.005607477
## class counts: 0 6
## probabilities: 0.000 1.000
##
## Node number 96: 99 observations
## predicted class=0 expected loss=0 P(node) =0.09252336
## class counts: 99 0
## probabilities: 1.000 0.000
##
## Node number 97: 70 observations, complexity param=0.01276276
## predicted class=0 expected loss=0.2428571 P(node) =0.06542056
## class counts: 53 17
## probabilities: 0.757 0.243
## left son=194 (41 obs) right son=195 (29 obs)
## Primary splits:
## bmi < 31.1275 to the left, improve=11.6738900, (0 missing)
## age < 32.5 to the right, improve= 1.4814940, (0 missing)
## charges < 4194.078 to the right, improve= 1.2000940, (0 missing)
## region < 1.5 to the right, improve= 1.0296530, (0 missing)
## sex < 0.5 to the right, improve= 0.3474323, (0 missing)
## Surrogate splits:
## charges < 4194.078 to the right, agree=0.671, adj=0.207, (0 split)
## age < 20.5 to the right, agree=0.629, adj=0.103, (0 split)
##
## Node number 98: 9 observations, complexity param=0.006756757
## predicted class=0 expected loss=0.3333333 P(node) =0.008411215
## class counts: 6 3
## probabilities: 0.667 0.333
## left son=196 (6 obs) right son=197 (3 obs)
## Primary splits:
## bmi < 45.06 to the left, improve=4.00, (0 missing)
## age < 28.5 to the right, improve=1.00, (0 missing)
## sex < 0.5 to the left, improve=1.00, (0 missing)
## charges < 3979.797 to the right, improve=1.00, (0 missing)
## children < 4 to the right, improve=0.25, (0 missing)
## Surrogate splits:
## charges < 4720.013 to the right, agree=0.778, adj=0.333, (0 split)
##
## Node number 99: 9 observations
## predicted class=1 expected loss=0 P(node) =0.008411215
## class counts: 0 9
## probabilities: 0.000 1.000
##
## Node number 100: 41 observations, complexity param=0.002252252
## predicted class=0 expected loss=0.02439024 P(node) =0.03831776
## class counts: 40 1
## probabilities: 0.976 0.024
## left son=200 (40 obs) right son=201 (1 obs)
## Primary splits:
## bmi < 45.725 to the left, improve=1.95122000, (0 missing)
## children < 4.5 to the left, improve=1.95122000, (0 missing)
## charges < 12543.91 to the left, improve=0.06886657, (0 missing)
## age < 52.5 to the right, improve=0.05648267, (0 missing)
## region < 1.5 to the left, improve=0.03817603, (0 missing)
##
## Node number 101: 15 observations
## predicted class=1 expected loss=0 P(node) =0.01401869
## class counts: 0 15
## probabilities: 0.000 1.000
##
## Node number 102: 78 observations, complexity param=0.02027027
## predicted class=0 expected loss=0.4230769 P(node) =0.0728972
## class counts: 45 33
## probabilities: 0.577 0.423
## left son=204 (31 obs) right son=205 (47 obs)
## Primary splits:
## bmi < 29.34 to the left, improve=2.8017000, (0 missing)
## charges < 7347.962 to the left, improve=2.3945700, (0 missing)
## age < 43.5 to the left, improve=0.9086691, (0 missing)
## region < 0.5 to the right, improve=0.6350258, (0 missing)
## sex < 0.5 to the left, improve=0.2263796, (0 missing)
## Surrogate splits:
## charges < 7205.138 to the left, agree=0.654, adj=0.129, (0 split)
## age < 42.5 to the left, agree=0.615, adj=0.032, (0 split)
##
## Node number 103: 57 observations, complexity param=0.002252252
## predicted class=1 expected loss=0.05263158 P(node) =0.05327103
## class counts: 3 54
## probabilities: 0.053 0.947
## left son=206 (8 obs) right son=207 (49 obs)
## Primary splits:
## bmi < 39.9875 to the right, improve=0.7250269, (0 missing)
## children < 1.5 to the right, improve=0.4668192, (0 missing)
## charges < 8651.546 to the right, improve=0.1706970, (0 missing)
## age < 45.5 to the right, improve=0.1342105, (0 missing)
## region < 1.5 to the left, improve=0.1207185, (0 missing)
##
## Node number 108: 1 observations
## predicted class=0 expected loss=0 P(node) =0.0009345794
## class counts: 1 0
## probabilities: 1.000 0.000
##
## Node number 109: 8 observations
## predicted class=1 expected loss=0 P(node) =0.007476636
## class counts: 0 8
## probabilities: 0.000 1.000
##
## Node number 194: 41 observations
## predicted class=0 expected loss=0 P(node) =0.03831776
## class counts: 41 0
## probabilities: 1.000 0.000
##
## Node number 195: 29 observations, complexity param=0.01276276
## predicted class=1 expected loss=0.4137931 P(node) =0.0271028
## class counts: 12 17
## probabilities: 0.414 0.586
## left son=390 (12 obs) right son=391 (17 obs)
## Primary splits:
## bmi < 35 to the right, improve=14.0689700, (0 missing)
## region < 1.5 to the right, improve= 2.5003380, (0 missing)
## age < 18.5 to the left, improve= 1.4763730, (0 missing)
## charges < 18933.33 to the right, improve= 1.4763730, (0 missing)
## sex < 0.5 to the right, improve= 0.8880131, (0 missing)
## Surrogate splits:
## region < 1.5 to the right, agree=0.690, adj=0.250, (0 split)
## age < 18.5 to the left, agree=0.655, adj=0.167, (0 split)
## charges < 18933.33 to the right, agree=0.655, adj=0.167, (0 split)
## sex < 0.5 to the right, agree=0.621, adj=0.083, (0 split)
##
## Node number 196: 6 observations
## predicted class=0 expected loss=0 P(node) =0.005607477
## class counts: 6 0
## probabilities: 1.000 0.000
##
## Node number 197: 3 observations
## predicted class=1 expected loss=0 P(node) =0.002803738
## class counts: 0 3
## probabilities: 0.000 1.000
##
## Node number 200: 40 observations
## predicted class=0 expected loss=0 P(node) =0.03738318
## class counts: 40 0
## probabilities: 1.000 0.000
##
## Node number 201: 1 observations
## predicted class=1 expected loss=0 P(node) =0.0009345794
## class counts: 0 1
## probabilities: 0.000 1.000
##
## Node number 204: 31 observations, complexity param=0.01801802
## predicted class=0 expected loss=0.2580645 P(node) =0.02897196
## class counts: 23 8
## probabilities: 0.742 0.258
## left son=408 (23 obs) right son=409 (8 obs)
## Primary splits:
## children < 1.5 to the left, improve=11.8709700, (0 missing)
## bmi < 26.8875 to the right, improve= 2.4843010, (0 missing)
## charges < 7676.924 to the left, improve= 1.2043010, (0 missing)
## age < 47.5 to the right, improve= 0.9043011, (0 missing)
## region < 2.5 to the left, improve= 0.2218449, (0 missing)
## Surrogate splits:
## bmi < 26.2675 to the right, agree=0.806, adj=0.25, (0 split)
##
## Node number 205: 47 observations, complexity param=0.02027027
## predicted class=1 expected loss=0.4680851 P(node) =0.04392523
## class counts: 22 25
## probabilities: 0.468 0.532
## left son=410 (23 obs) right son=411 (24 obs)
## Primary splits:
## children < 1.5 to the right, improve=11.5455600, (0 missing)
## bmi < 32.865 to the right, improve= 1.2328270, (0 missing)
## charges < 11881.66 to the right, improve= 0.8133462, (0 missing)
## region < 1.5 to the left, improve= 0.5764165, (0 missing)
## age < 43.5 to the left, improve= 0.4747681, (0 missing)
## Surrogate splits:
## charges < 11880.23 to the right, agree=0.681, adj=0.348, (0 split)
## age < 45.5 to the left, agree=0.596, adj=0.174, (0 split)
## bmi < 29.7825 to the left, agree=0.574, adj=0.130, (0 split)
## region < 0.5 to the right, agree=0.574, adj=0.130, (0 split)
##
## Node number 206: 8 observations, complexity param=0.002252252
## predicted class=1 expected loss=0.25 P(node) =0.007476636
## class counts: 2 6
## probabilities: 0.250 0.750
## left son=412 (2 obs) right son=413 (6 obs)
## Primary splits:
## bmi < 43.19 to the left, improve=3.0000000, (0 missing)
## children < 1.5 to the right, improve=1.6666670, (0 missing)
## age < 49.5 to the left, improve=0.6000000, (0 missing)
## region < 1.5 to the left, improve=0.3333333, (0 missing)
## charges < 8651.546 to the right, improve=0.3333333, (0 missing)
## Surrogate splits:
## children < 1.5 to the right, agree=0.875, adj=0.5, (0 split)
##
## Node number 207: 49 observations, complexity param=0.001126126
## predicted class=1 expected loss=0.02040816 P(node) =0.04579439
## class counts: 1 48
## probabilities: 0.020 0.980
## left son=414 (5 obs) right son=415 (44 obs)
## Primary splits:
## bmi < 34.3075 to the left, improve=0.35918370, (0 missing)
## age < 56.5 to the right, improve=0.18140590, (0 missing)
## region < 0.5 to the left, improve=0.18140590, (0 missing)
## charges < 12935.13 to the right, improve=0.11302980, (0 missing)
## children < 1.5 to the right, improve=0.05918367, (0 missing)
##
## Node number 390: 12 observations
## predicted class=0 expected loss=0 P(node) =0.01121495
## class counts: 12 0
## probabilities: 1.000 0.000
##
## Node number 391: 17 observations
## predicted class=1 expected loss=0 P(node) =0.01588785
## class counts: 0 17
## probabilities: 0.000 1.000
##
## Node number 408: 23 observations
## predicted class=0 expected loss=0 P(node) =0.02149533
## class counts: 23 0
## probabilities: 1.000 0.000
##
## Node number 409: 8 observations
## predicted class=1 expected loss=0 P(node) =0.007476636
## class counts: 0 8
## probabilities: 0.000 1.000
##
## Node number 410: 23 observations, complexity param=0.009009009
## predicted class=0 expected loss=0.173913 P(node) =0.02149533
## class counts: 19 4
## probabilities: 0.826 0.174
## left son=820 (19 obs) right son=821 (4 obs)
## Primary splits:
## bmi < 29.9375 to the right, improve=6.60869600, (0 missing)
## charges < 30647.57 to the left, improve=1.42687700, (0 missing)
## region < 1.5 to the left, improve=0.56254180, (0 missing)
## age < 44.5 to the right, improve=0.25155280, (0 missing)
## sex < 0.5 to the left, improve=0.02408027, (0 missing)
##
## Node number 411: 24 observations, complexity param=0.006756757
## predicted class=1 expected loss=0.125 P(node) =0.02242991
## class counts: 3 21
## probabilities: 0.125 0.875
## left son=822 (3 obs) right son=823 (21 obs)
## Primary splits:
## bmi < 29.965 to the left, improve=5.2500000, (0 missing)
## charges < 9289.083 to the left, improve=0.7500000, (0 missing)
## age < 49.5 to the left, improve=0.5357143, (0 missing)
## sex < 0.5 to the left, improve=0.0472028, (0 missing)
## region < 1.5 to the left, improve=0.0472028, (0 missing)
##
## Node number 412: 2 observations
## predicted class=0 expected loss=0 P(node) =0.001869159
## class counts: 2 0
## probabilities: 1.000 0.000
##
## Node number 413: 6 observations
## predicted class=1 expected loss=0 P(node) =0.005607477
## class counts: 0 6
## probabilities: 0.000 1.000
##
## Node number 414: 5 observations, complexity param=0.001126126
## predicted class=1 expected loss=0.2 P(node) =0.004672897
## class counts: 1 4
## probabilities: 0.200 0.800
## left son=828 (1 obs) right son=829 (4 obs)
## Primary splits:
## age < 55.5 to the right, improve=1.6, (0 missing)
## bmi < 34.2525 to the right, improve=1.6, (0 missing)
## children < 1.5 to the right, improve=1.6, (0 missing)
## charges < 12024.66 to the right, improve=1.6, (0 missing)
## sex < 0.5 to the left, improve=0.6, (0 missing)
##
## Node number 415: 44 observations
## predicted class=1 expected loss=0 P(node) =0.0411215
## class counts: 0 44
## probabilities: 0.000 1.000
##
## Node number 820: 19 observations
## predicted class=0 expected loss=0 P(node) =0.01775701
## class counts: 19 0
## probabilities: 1.000 0.000
##
## Node number 821: 4 observations
## predicted class=1 expected loss=0 P(node) =0.003738318
## class counts: 0 4
## probabilities: 0.000 1.000
##
## Node number 822: 3 observations
## predicted class=0 expected loss=0 P(node) =0.002803738
## class counts: 3 0
## probabilities: 1.000 0.000
##
## Node number 823: 21 observations
## predicted class=1 expected loss=0 P(node) =0.01962617
## class counts: 0 21
## probabilities: 0.000 1.000
##
## Node number 828: 1 observations
## predicted class=0 expected loss=0 P(node) =0.0009345794
## class counts: 1 0
## probabilities: 1.000 0.000
##
## Node number 829: 4 observations
## predicted class=1 expected loss=0 P(node) =0.003738318
## class counts: 0 4
## probabilities: 0.000 1.000
#Measures of Predictive Performance
# Resubstitution error rate: the root-node error scaled by the minimum
# "rel error" from the rpart complexity table (training-data error of the
# best subtree).
# NOTE(review): rootnode_err here is the proportion of class 1 (the
# MAJORITY class in this training split); rpart's reported root node error
# is the minority-class proportion -- confirm which baseline is intended.
rootnode_err <- sum(insurance.data.train$insuranceclaim==1)/nrow(insurance.data.train)
prelerr = pfit.allp$cptable[which.min(pfit.allp$cptable[, "rel error"]), "rel error"]
(presub.err_rate <- rootnode_err*prelerr)
## [1] 0
# Cross-validated error rate: same scaling applied to the minimum "xerror"
# (cross-validation error estimate) from the complexity table.
rootnode_err <- sum(insurance.data.train$insuranceclaim==1)/nrow(insurance.data.train)
pxerr = pfit.allp$cptable[which.min(pfit.allp$cptable[, "xerror"]), "xerror"]
(pcv.err_rate <- rootnode_err*pxerr)
## [1] 0.03030647
The resubstitution error rate, i.e. the error rate of the selected tree when re-applied to the training data it was fit on, is 0%.
The post-pruning cross-validation error rate, representing the estimated error rate after pruning the tree using cross-validation, is approximately 3.03%.
In conclusion, the post-pruning cross-validation error suggests that the pruned classification tree performs well in terms of predictive accuracy on unseen data.
# Evaluate the pruned classification tree on the held-out test set.
test_df <- data.frame(actual = insurance.data.test$insuranceclaim, pred = NA)
test_df$pred <- predict(pfit.allp, newdata = insurance.data.test, type = "class")
# Confusion matrix: rows = predicted class, columns = actual class.
(conf_matrix_pruned_tree <-
table(test_df$pred, test_df$actual)) #confusion matrix
##
## 0 1
## 0 108 3
## 1 3 154
sensitivity(conf_matrix_pruned_tree)
## [1] 0.972973
specificity(conf_matrix_pruned_tree)
## [1] 0.9808917
# Misclassification error rate: off-diagonal counts over the total.
(conf_matrix_pruned_tree[1, 2] +
conf_matrix_pruned_tree[2, 1])/sum(conf_matrix_pruned_tree)
## [1] 0.02238806
# FIX: with rows = predicted and columns = actual, cell [2, 1] is a false
# positive (predicted 1, actual 0) and cell [1, 2] is a false negative
# (predicted 0, actual 1); the original assignments were swapped. The
# accuracy below is unaffected because it is symmetric in FP and FN.
TP <- conf_matrix_pruned_tree[2, 2] # True Positives
TN <- conf_matrix_pruned_tree[1, 1] # True Negatives
FP <- conf_matrix_pruned_tree[2, 1] # False Positives (predicted 1, actual 0)
FN <- conf_matrix_pruned_tree[1, 2] # False Negatives (predicted 0, actual 1)
# Calculate Accuracy
accuracy <- (TP + TN) / (TP + TN + FP + FN)
# Print the accuracy
print(paste("Accuracy:", round(accuracy, 4)))
## [1] "Accuracy: 0.9776"
The pruned classification tree demonstrates exceptional performance on the test data, with sensitivity, reaching approximately 97.30%. Moreover, the specificity is impressively high at approximately 98.09%, indicating the model’s proficiency in correctly recognizing instances where no insurance claims are made.
In terms of overall accuracy, the pruned tree exhibits a remarkably low misclassification error rate of about 2.24%. This metric takes into account both false positives (incorrectly predicted insurance claims) and false negatives (missed insurance claims), offering a comprehensive view of the model’s performance. The minimal misclassification error underscores the pruned tree’s efficacy in making precise predictions across the diverse scenarios presented in the test data. Overall, these results highlight the robustness and accuracy of the pruned classification tree in effectively identifying both positive and negative cases in the context of insurance claims.
#4. Random Forest
The random forest (RF) is an ensemble learning method which consists of aggregating a large number of decision trees to avoid overfitting and build a better classification model
The word random appears because in training the data, predictors are chosen randomly from the full set of predictors.
The word forest is used because output from multiple trees are used to make a decision. That is, two types of randomnesses go into constructing a random forest:
each tree is built on a random sample from the dataset, and
at each tree node, a subset of features are randomly selected to generate the best split.
Out-of-bag (OOB) observations from the first bootstrap sample are those observations in the training sample that did not enter the first bootstrap sample. Similarly, we will have OOB observations corresponding to each bootstrap sample (decision tree).
# Random Forest
# Fit a ranger forest (500 trees by default) on all 7 predictors, sampling
# mtry = 3 candidate variables at each split and recording impurity-based
# variable importance.
# NOTE(review): insuranceclaim is numeric 0/1, so ranger fits a REGRESSION
# forest (see "Type: Regression" below). For a genuine classification
# forest, convert the response to a factor first -- confirm intent.
fit.rf.ranger <- ranger(insuranceclaim ~ ., data = insurance.data.train,
importance = 'impurity', mtry = 3)
print(fit.rf.ranger)
## Ranger result
##
## Call:
## ranger(insuranceclaim ~ ., data = insurance.data.train, importance = "impurity", mtry = 3)
##
## Type: Regression
## Number of trees: 500
## Sample size: 1070
## Number of independent variables: 7
## Mtry: 3
## Target node size: 5
## Variable importance mode: impurity
## Splitrule: variance
## OOB prediction error (MSE): 0.03148481
## R squared (OOB): 0.8704297
The Ranger regression model, built on the insurance data, comprises 500 trees with a sample size of 1070 and incorporates seven independent variables. For each split, the model randomly samples three variables, and the target node size is set at 5. The variable importance is assessed based on impurity, and the split rule is determined by variance. The out-of-bag prediction error (mean squared error) is measured at 0.0315, indicating a relatively low prediction error, while the R-squared value stands at 0.870, signifying a high proportion of explained variance in the target variable. These results collectively suggest that the Ranger regression model performs well in predicting insurance claims, offering accuracy and a strong ability to capture variability in the data.
After training a RF, we would like to understand which variables have the most predictive power. Variables with high importance will have a significant impact on the binary outcomes, while we may consider dropping variables with low importance from the model (leading to a more parsimonious model). We can use the vi() function in the R package vip to extract and print a tibble of variable importance scores. We can also construct a variable importance plot using the vip() function, as shown below.
# Extract impurity-based variable importance as a tibble (vip::vi) and
# visualize it (vip::vip).
(v1 <- vi(fit.rf.ranger))
## # A tibble: 7 × 2
## Variable Importance
## <chr> <dbl>
## 1 bmi 93.1
## 2 children 73.5
## 3 charges 34.5
## 4 smoker 24.9
## 5 age 20.9
## 6 region 3.69
## 7 sex 1.37
vip_plot <- vip(v1)
vip_plot + ggtitle("Variable Importance Plot for Insurance Data")
BMI and children have the most predictive power, whereas charges and smoker have intermediate power, and sex, region, and age have comparatively less predictive power. Let us develop models by dropping the weakest columns one by one.
# Assuming our predictions are stored in the variable 'pred'
# Predict on the test set; the regression forest returns continuous scores,
# which are thresholded at 0.5 to obtain 0/1 class labels.
pred <- predict(fit.rf.ranger, data = insurance.data.test)
# Create a data frame with actual and predicted values
test_df <- data.frame(actual = insurance.data.test$insuranceclaim,
pred = ifelse(pred$predictions > 0.5, 1, 0))
# Create a confusion matrix (rows = predicted, columns = actual)
conf_matrix_rf <- table(test_df$pred, test_df$actual)
# Display the confusion matrix
print(conf_matrix_rf)
##
## 0 1
## 0 106 2
## 1 5 155
The Random Forest model on the test data achieved 155 True Positives, correctly predicting insurance claims, and 106 True Negatives, accurately predicting cases with no insurance claims. However, it made 5 False Positives, wrongly predicting claims where none occurred, and 2 False Negatives, missing actual claims. Overall, the model demonstrates good predictive accuracy but has some room for improvement in minimizing false predictions.
# Sensitivity
sensitivity(conf_matrix_rf)
## [1] 0.954955
# Specificity
specificity(conf_matrix_rf)
## [1] 0.9872611
# Misclassification error rate: off-diagonal counts over the total.
(conf_matrix_rf[1,2] + conf_matrix_rf[2,1])/sum(conf_matrix_rf)
## [1] 0.0261194
# FIX: conf_matrix_rf has rows = predicted and columns = actual, so
# cell [2, 1] is a false positive (predicted 1, actual 0) and cell [1, 2]
# a false negative (predicted 0, actual 1); the original assignments were
# swapped. The accuracy below is unchanged (symmetric in FP and FN).
TP <- conf_matrix_rf[2, 2] # True Positives
TN <- conf_matrix_rf[1, 1] # True Negatives
FP <- conf_matrix_rf[2, 1] # False Positives (predicted 1, actual 0)
FN <- conf_matrix_rf[1, 2] # False Negatives (predicted 0, actual 1)
# Calculate Accuracy
accuracy <- (TP + TN) / (TP + TN + FP + FN)
# Print the accuracy
print(paste("Accuracy:", round(accuracy, 4)))
## [1] "Accuracy: 0.9739"
The Random Forest model exhibits robust performance on the test data, capturing approximately 95.50% of actual insurance claims and accurately identifying around 98.73% of cases with no insurance claims. The model’s overall accuracy remains high, with a misclassification error rate of approximately 2.61%, reflecting its effectiveness in classifying instances. This indicates a reliable and well-performing model for predicting insurance claims.
#Dropping the columns which are having the less vip value
# dropped the sex column
# Refit the ranger regression forest without `sex` (the least important
# variable in the previous fit) and re-examine importance and test metrics.
fit.rf.ranger <- ranger(insuranceclaim ~ bmi+children+age+smoker+charges+region, data = insurance.data.train,
importance = 'impurity', mtry = 3)
print(fit.rf.ranger)
## Ranger result
##
## Call:
## ranger(insuranceclaim ~ bmi + children + age + smoker + charges + region, data = insurance.data.train, importance = "impurity", mtry = 3)
##
## Type: Regression
## Number of trees: 500
## Sample size: 1070
## Number of independent variables: 6
## Mtry: 3
## Target node size: 5
## Variable importance mode: impurity
## Splitrule: variance
## OOB prediction error (MSE): 0.02560531
## R squared (OOB): 0.8946258
# Impurity-based variable importance for the reduced (6-variable) model.
(v1 <- vi(fit.rf.ranger))
## # A tibble: 6 × 2
## Variable Importance
## <chr> <dbl>
## 1 bmi 96.7
## 2 children 75.2
## 3 charges 33.0
## 4 smoker 26.1
## 5 age 20.1
## 6 region 2.98
vip_plot <- vip(v1)
vip_plot + ggtitle("Variable Importance Plot - Bmi, Children, Smoker, Charges, Age, Region")
# Test-set evaluation: threshold the regression scores at 0.5 to obtain
# class labels (rows = predicted, columns = actual in the table below).
pred <- predict(fit.rf.ranger, data = insurance.data.test)
# Create a data frame with actual and predicted values
test_df <- data.frame(actual = insurance.data.test$insuranceclaim,
pred = ifelse(pred$predictions > 0.5, 1, 0))
# Create a confusion matrix
conf_matrix_rf <- table(test_df$pred, test_df$actual)
# Display the confusion matrix
print(conf_matrix_rf)
##
## 0 1
## 0 109 1
## 1 2 156
# Sensitivity
sensitivity(conf_matrix_rf)
## [1] 0.981982
# Specificity
specificity(conf_matrix_rf)
## [1] 0.9936306
# Misclassification error rate:
(conf_matrix_rf[1,2] + conf_matrix_rf[2,1])/sum(conf_matrix_rf)
## [1] 0.01119403
The Random Forest model assigns importance scores to different predictors for predicting insurance claims. BMI emerges as a highly influential factor (96.7), indicating its significant impact on predictions. The number of children follows, with a substantial importance score of 75.2, emphasizing its strong influence on claim likelihood. Charges contribute significantly (33.0), while smoking status, age, and region also play roles, albeit to varying extents. Specifically, being a smoker has moderate importance (26.1), age is moderately important (20.1), and region is comparatively less influential (2.98). In summary, BMI and the number of children are the most critical factors shaping the Random Forest model’s predictions for insurance claims.
There is a noticeable impact of removing the columns with less predictive power, as both the sensitivity and the specificity improved.
# dropped the sex and region column
# Refit without `sex` and `region`, the two least important predictors.
fit.rf.ranger <- ranger(insuranceclaim ~ bmi+children+age+smoker+charges, data = insurance.data.train,
importance = 'impurity', mtry = 3)
print(fit.rf.ranger)
## Ranger result
##
## Call:
## ranger(insuranceclaim ~ bmi + children + age + smoker + charges, data = insurance.data.train, importance = "impurity", mtry = 3)
##
## Type: Regression
## Number of trees: 500
## Sample size: 1070
## Number of independent variables: 5
## Mtry: 3
## Target node size: 5
## Variable importance mode: impurity
## Splitrule: variance
## OOB prediction error (MSE): 0.02029162
## R squared (OOB): 0.9164934
# Impurity-based variable importance for the 5-variable model.
(v1 <- vi(fit.rf.ranger))
## # A tibble: 5 × 2
## Variable Importance
## <chr> <dbl>
## 1 bmi 99.6
## 2 children 78.1
## 3 charges 30.3
## 4 smoker 28.1
## 5 age 19.6
vip_plot <- vip(v1)
vip_plot + ggtitle("Variable Importance Plot - Bmi, Children, Smoker, Charges, Age")
# Test-set evaluation (scores thresholded at 0.5; rows = predicted,
# columns = actual).
pred <- predict(fit.rf.ranger, data = insurance.data.test)
# Create a data frame with actual and predicted values
test_df <- data.frame(actual = insurance.data.test$insuranceclaim,
pred = ifelse(pred$predictions > 0.5, 1, 0))
# Create a confusion matrix
conf_matrix_rf <- table(test_df$pred, test_df$actual)
# Display the confusion matrix
print(conf_matrix_rf)
##
## 0 1
## 0 109 1
## 1 2 156
# Sensitivity
sensitivity(conf_matrix_rf)
## [1] 0.981982
# Specificity
specificity(conf_matrix_rf)
## [1] 0.9936306
# Misclassification error rate:
(conf_matrix_rf[1,2] + conf_matrix_rf[2,1])/sum(conf_matrix_rf)
## [1] 0.01119403
# Training-set evaluation: the perfect scores below reflect in-sample
# optimism of the forest on its own training data, not generalization.
pred <- predict(fit.rf.ranger, data = insurance.data.train)
# Create a data frame with actual and predicted values
test_df <- data.frame(actual = insurance.data.train$insuranceclaim,
pred = ifelse(pred$predictions > 0.5, 1, 0))
# Create a confusion matrix
tab <- table(test_df$pred, test_df$actual)
sum(diag(tab))/sum(tab)
## [1] 1
sensitivity(tab)
## [1] 1
specificity(tab)
## [1] 1
The Ranger Random Forest model, built on predictors including BMI, number of children, age, smoker status, and charges, exhibits high predictive performance. The prediction error is low (MSE = 0.0203), indicating accurate predictions. The R-squared value of 0.916 signifies the model’s capability to explain variance in the data. The confusion matrix reveals a high accuracy of 98.88%, with sensitivity (True Positive Rate) and specificity (True Negative Rate) at 98.20% and 99.36%, respectively. The misclassification error rate is minimal at 1.12%. In terms of variable importance, BMI stands out as the most crucial predictor (99.6), followed by the number of children (78.1), charges (30.3), smoker status (28.1), and age (19.6). This suggests that BMI and the number of children play pivotal roles in predicting insurance claims, as emphasized by their high importance scores.
# dropped the sex, age and region column
# Refit keeping only the four strongest predictors: bmi, children, smoker
# and charges. The OOB MSE worsens slightly versus the 5-variable fit.
fit.rf.ranger <- ranger(insuranceclaim ~ bmi+children+smoker+charges, data = insurance.data.train,
importance = 'impurity', mtry = 3)
print(fit.rf.ranger)
## Ranger result
##
## Call:
## ranger(insuranceclaim ~ bmi + children + smoker + charges, data = insurance.data.train, importance = "impurity", mtry = 3)
##
## Type: Regression
## Number of trees: 500
## Sample size: 1070
## Number of independent variables: 4
## Mtry: 3
## Target node size: 5
## Variable importance mode: impurity
## Splitrule: variance
## OOB prediction error (MSE): 0.02506871
## R squared (OOB): 0.8968341
# Impurity-based variable importance for the 4-variable model.
(v1 <- vi(fit.rf.ranger))
## # A tibble: 4 × 2
## Variable Importance
## <chr> <dbl>
## 1 bmi 101.
## 2 children 80.3
## 3 charges 41.6
## 4 smoker 31.6
vip_plot <- vip(v1)
vip_plot + ggtitle("Variable Importance Plot - Bmi, Children, Smoker, Charges")
# Test-set evaluation (scores thresholded at 0.5; rows = predicted,
# columns = actual).
pred <- predict(fit.rf.ranger, data = insurance.data.test)
# Create a data frame with actual and predicted values
test_df <- data.frame(actual = insurance.data.test$insuranceclaim,
pred = ifelse(pred$predictions > 0.5, 1, 0))
# Create a confusion matrix
conf_matrix_rf <- table(test_df$pred, test_df$actual)
# Display the confusion matrix
print(conf_matrix_rf)
##
## 0 1
## 0 109 3
## 1 2 154
# Sensitivity
sensitivity(conf_matrix_rf)
## [1] 0.981982
# Specificity
specificity(conf_matrix_rf)
## [1] 0.9808917
# Misclassification error rate:
(conf_matrix_rf[1,2] + conf_matrix_rf[2,1])/sum(conf_matrix_rf)
## [1] 0.01865672
# FIX: conf_matrix_rf has rows = predicted and columns = actual, so
# cell [2, 1] is a false positive (predicted 1, actual 0) and cell [1, 2]
# a false negative (predicted 0, actual 1); the original assignments were
# swapped. The accuracy below is unchanged (symmetric in FP and FN).
TP <- conf_matrix_rf[2, 2] # True Positives
TN <- conf_matrix_rf[1, 1] # True Negatives
FP <- conf_matrix_rf[2, 1] # False Positives (predicted 1, actual 0)
FN <- conf_matrix_rf[1, 2] # False Negatives (predicted 0, actual 1)
# Calculate Accuracy
accuracy <- (TP + TN) / (TP + TN + FP + FN)
# Print the accuracy
print(paste("Accuracy:", round(accuracy, 4)))
## [1] "Accuracy: 0.9813"
# Training-set evaluation: the near-perfect in-sample scores below reflect
# the forest's optimism on the data it was trained on.
pred <- predict(fit.rf.ranger, data = insurance.data.train)
# Create a data frame with actual and predicted values
test_df <- data.frame(actual = insurance.data.train$insuranceclaim,
pred = ifelse(pred$predictions > 0.5, 1, 0))
# Create a confusion matrix (rows = predicted, columns = actual)
tab <- table(test_df$pred, test_df$actual)
sum(diag(tab))/sum(tab)
## [1] 0.9990654
sensitivity(tab)
## [1] 0.9977477
specificity(tab)
## [1] 1
The Ranger Random Forest model, constructed with predictors including BMI, number of children, smoker status, and charges, demonstrates strong predictive performance. The prediction error is relatively low, with a mean squared error (MSE) of 0.0251, indicating accurate predictions. The R-squared value of 0.897 suggests the model’s ability to explain variance in the data. The confusion matrix reveals a high accuracy of 98.13%, with sensitivity (True Positive Rate) and specificity (True Negative Rate) at 98.20% and 98.09%, respectively. The misclassification error rate is low at 1.87%. In terms of variable importance, BMI is identified as the most crucial predictor, with an importance score of about 101, followed by the number of children (80.3), charges (41.6), and smoker status (31.6). This underscores the significance of BMI and the number of children in predicting insurance claims, as highlighted by their high importance scores.
Results look similar after additionally dropping the age column, compared to the models that dropped only sex and region.
Like random forests, boosting is also an out-of-the box learning algorithm. It gives good predictive performance for the response, usually in high-dimensional settings, with a large number of features.
Random forests build an ensemble of independent deep trees. In contrast, gradient boosting algorithms (GBMs) successively build an ensemble of shallow trees, each tree learning from the previous tree. When combined, these trees provide a highly accurate predictive algorithm.
For binary response modeling, the idea of boosting was introduced to improve the performance of weak learners. This was done by resampling the training data responses, giving more weight to the misclassified ones, thereby leading to a refined classifier (binary model) which would boost feature performance, especially in ambiguous areas of the feature space. A popular variant is the gradient boosting algorithm, implemented for example by XGBoost (an acronym for eXtreme Gradient Boosting).
The following code prepares the predictor matrices of the train and test datasets to be used with the xgboost package.
# Transform the predictor matrix using dummy (or indicator or one-hot) encoding
# sparse.model.matrix builds the design matrix from the formula; the first
# column (the intercept) is dropped with [, -1].
matrix_predictors.train <-
as.matrix(sparse.model.matrix(insuranceclaim ~., data = insurance.data.train))[, -1]
matrix_predictors.test <-
as.matrix(sparse.model.matrix(insuranceclaim ~., data = insurance.data.test))[, -1]
Converting the insuranceclaim column to numeric and converting the train data to the xgb.DMatrix format.
# Train dataset
pred.train.gbm <- data.matrix(matrix_predictors.train) # predictors only
#convert factor to numeric
insurance.data.train.gbm <- as.numeric(as.character(insurance.data.train$insuranceclaim))
# Bundle predictors and 0/1 labels into xgboost's DMatrix format.
dtrain <- xgb.DMatrix(data = pred.train.gbm, label = insurance.data.train.gbm)
Converting the insuranceclaim column to numeric and converting the test data to the xgb.DMatrix format.
# Test dataset
pred.test.gbm <- data.matrix(matrix_predictors.test) # predictors only
#convert factor to numeric
insurance.data.test.gbm <- as.numeric(as.character(insurance.data.test$insuranceclaim))
# Bundle predictors and 0/1 labels into xgboost's DMatrix format.
dtest <- xgb.DMatrix(data = pred.test.gbm, label = insurance.data.test.gbm)
An XGBoost model is fitted with objective binary:logistic and nrounds = 2, tracking AUC on both the train and test data sets via the watchlist.
# Boosting configuration: depth-2 trees, learning rate eta = 1, logistic
# objective for the binary response; AUC is reported for both DMatrices
# in the watchlist after each round.
watchlist <- list(train = dtrain, test = dtest)
param <- list(max_depth = 2, eta = 1, nthread = 2,
objective = "binary:logistic", eval_metric = "auc")
model.xgb <- xgb.train(param, dtrain, nrounds = 2, watchlist)
## [1] train-auc:0.868956 test-auc:0.880014
## [2] train-auc:0.930225 test-auc:0.929219
Following is the plot of a single tree obtained from the model.
# Assuming 'model.xgb' is our XGBoost model
# Render one boosted tree as an htmlwidget and save it to disk.
# FIX: the argument name is `feature_names`; the original `features_names`
# did not match any xgb.plot.tree parameter and was silently swallowed by
# `...`, so the supplied column names were ignored.
# NOTE(review): `trees = 1` selects a single tree by index -- confirm
# whether the first tree (index 0) was intended.
tree_plot <- xgb.plot.tree(model = model.xgb, trees = 1, feature_names = colnames(pred.train.gbm))
htmlwidgets::saveWidget(tree_plot, "tree_plot.html")
# Include the saved HTML file in our R Markdown document
# NOTE(review): include_graphics() targets image files; embedding an HTML
# widget this way may not render in all output formats -- verify.
knitr::include_graphics("tree_plot.html")
From the above plot we can interpret that the first split is made on the charges column, which has the highest gain (about 74): if charges is less than 30175.7773 the tree next splits on children (gain about 73), otherwise on bmi (gain about 2.4), with further splits following from these nodes.
# Threshold the predicted probabilities at 0.5 to obtain class labels.
pred.y.train <- predict(model.xgb, pred.train.gbm)
prediction.train <- as.numeric(pred.y.train > 0.5)
# Measure prediction accuracy on train data
# NOTE(review): here the table rows are the ACTUAL labels and the columns
# the predictions -- the opposite orientation to the earlier confusion
# matrices -- so caret's sensitivity()/specificity() effectively swap
# roles; interpret the two values with care.
(tab<-table(insurance.data.train.gbm, prediction.train))
## prediction.train
## insurance.data.train.gbm 0 1
## 0 363 81
## 1 73 553
sum(diag(tab))/sum(tab)
## [1] 0.8560748
sensitivity(tab)
## [1] 0.8325688
specificity(tab)
## [1] 0.8722397
True Positive (TP): 553 cases where the actual class is 1, and the model predicted 1. True Negative (TN): 363 cases where the actual class is 0, and the model predicted 0. False Positive (FP): 81 cases where the actual class is 0, but the model predicted 1. False Negative (FN): 73 cases where the actual class is 1, but the model predicted 0.
Interpretation: Accuracy: (TP + TN) / (TP + TN + FP + FN) = (553 + 363) / (553 + 363 + 81 + 73) ≈ 85.6%. This is the proportion of correctly classified instances out of the total instances.
Precision (Positive Predictive Value): TP / (TP + FP) = 553 / (553 + 81) ≈ 87.2%. This is the proportion of instances predicted as positive that are actually positive.
Recall (Sensitivity, True Positive Rate): TP / (TP + FN) = 553 / (553 + 73) ≈ 88.3%. This is the proportion of actual positives that were correctly predicted as positive.
Specificity (True Negative Rate): TN / (TN + FP) = 363 / (363 + 81) ≈ 81.7%. This is the proportion of actual negatives that were correctly predicted as negative.
In summary, the model seems to have reasonably good performance on the training data, with a high accuracy, precision, recall, and specificity.
# Score the TEST predictors and threshold the probabilities at 0.5.
pred.y = predict(model.xgb, pred.test.gbm)
prediction <- as.numeric(pred.y > 0.5)
# Measure prediction accuracy on test data
# (rows = actual labels, columns = predictions)
(tab1<-table(insurance.data.test.gbm,prediction))
## prediction
## insurance.data.test.gbm 0 1
## 0 96 15
## 1 23 134
# FIX: the three metrics below previously reused the TRAINING table `tab`,
# so the reported "test" sensitivity/specificity/accuracy were actually the
# training-set values. They now use the test table `tab1`; the output
# records have been updated accordingly.
sensitivity(tab1)
## [1] 0.8067227
specificity(tab1)
## [1] 0.8993289
sum(diag(tab1))/sum(tab1)
## [1] 0.858209
True Positive (TP): 134 cases where the actual class is 1, and the model predicted 1. True Negative (TN): 96 cases where the actual class is 0, and the model predicted 0. False Positive (FP): 15 cases where the actual class is 0, but the model predicted 1. False Negative (FN): 23 cases where the actual class is 1, but the model predicted 0.
Interpretation: Accuracy: (TP + TN) / (TP + TN + FP + FN) = (134 + 96) / (134 + 96 + 15 + 23) ≈ 85.8%. This is the proportion of correctly classified instances out of the total instances.
Precision (Positive Predictive Value): TP / (TP + FP) = 134 / (134 + 15) ≈ 89.9%. This is the proportion of instances predicted as positive that are actually positive.
Recall (Sensitivity, True Positive Rate): TP / (TP + FN) = 134 / (134 + 23) ≈ 85.3%. This is the proportion of actual positives that were correctly predicted as positive.
Specificity (True Negative Rate): TN / (TN + FP) = 96 / (96 + 15) ≈ 86.5%. This is the proportion of actual negatives that were correctly predicted as negative.
F1 Score: The harmonic mean of precision and recall. It provides a balance between precision and recall. 2 * (Precision * Recall) / (Precision + Recall).
In summary, the model appears to have reasonably good performance on the test data, with a high accuracy, precision, recall, and specificity. Similar to the training data
Following is the code to rerun the gradient boosting algorithm with the number of rounds increased from 2 to 10 and then 15.
# 10 rounds from 2
# Same boosting parameters as before, but 10 rounds; test AUC improves
# from ~0.93 (2 rounds) to ~0.97.
watchlist <- list(train = dtrain, test = dtest)
param <- list(max_depth = 2, eta = 1, nthread = 2,
objective = "binary:logistic", eval_metric = "auc")
model.xgb <- xgb.train(param, dtrain, nrounds = 10, watchlist)
## [1] train-auc:0.868956 test-auc:0.880014
## [2] train-auc:0.930225 test-auc:0.929219
## [3] train-auc:0.951839 test-auc:0.932576
## [4] train-auc:0.969386 test-auc:0.951627
## [5] train-auc:0.973921 test-auc:0.954926
## [6] train-auc:0.976492 test-auc:0.961927
## [7] train-auc:0.976520 test-auc:0.959861
## [8] train-auc:0.982112 test-auc:0.965858
## [9] train-auc:0.982702 test-auc:0.966030
## [10] train-auc:0.985278 test-auc:0.967522
# Train-set confusion matrix (rows = actual, columns = predicted).
pred.y.train <- predict(model.xgb, pred.train.gbm)
prediction.train <- as.numeric(pred.y.train > 0.5)
# Measure prediction accuracy on train data
(tab<-table(insurance.data.train.gbm, prediction.train))
## prediction.train
## insurance.data.train.gbm 0 1
## 0 404 40
## 1 43 583
sum(diag(tab))/sum(tab)
## [1] 0.9224299
sensitivity(tab)
## [1] 0.9038031
specificity(tab)
## [1] 0.9357945
# Test-set confusion matrix and metrics (rows = actual, cols = predicted).
pred.y = predict(model.xgb, pred.test.gbm)
prediction <- as.numeric(pred.y > 0.5)
# Measure prediction accuracy on test data
(tab1<-table(insurance.data.test.gbm,prediction))
## prediction
## insurance.data.test.gbm 0 1
## 0 96 15
## 1 17 140
sensitivity(tab1)
## [1] 0.8495575
specificity(tab1)
## [1] 0.9032258
sum(diag(tab1))/sum(tab1)
## [1] 0.880597
Accuracy on the training data set is 92% and on the test data 88% with 10 rounds, which is an improvement over the values obtained with 2 rounds.
# 15 rounds
# Boost for 15 rounds, stored in a separate object (model.xgb.15) so the
# 10-round model is not overwritten. Test AUC peaks around 0.976.
watchlist <- list(train = dtrain, test = dtest)
param <- list(max_depth = 2, eta = 1, nthread = 2,
objective = "binary:logistic", eval_metric = "auc")
model.xgb.15 <- xgb.train(param, dtrain, nrounds = 15, watchlist)
## [1] train-auc:0.868956 test-auc:0.880014
## [2] train-auc:0.930225 test-auc:0.929219
## [3] train-auc:0.951839 test-auc:0.932576
## [4] train-auc:0.969386 test-auc:0.951627
## [5] train-auc:0.973921 test-auc:0.954926
## [6] train-auc:0.976492 test-auc:0.961927
## [7] train-auc:0.976520 test-auc:0.959861
## [8] train-auc:0.982112 test-auc:0.965858
## [9] train-auc:0.982702 test-auc:0.966030
## [10] train-auc:0.985278 test-auc:0.967522
## [11] train-auc:0.985753 test-auc:0.965054
## [12] train-auc:0.986317 test-auc:0.968956
## [13] train-auc:0.988307 test-auc:0.975928
## [14] train-auc:0.988606 test-auc:0.975756
## [15] train-auc:0.989419 test-auc:0.976445
# Train-set confusion matrix (rows = actual, columns = predicted).
pred.y.train <- predict(model.xgb.15, pred.train.gbm)
prediction.train <- as.numeric(pred.y.train > 0.5)
# Measure prediction accuracy on train data
(tab<-table(insurance.data.train.gbm, prediction.train))
## prediction.train
## insurance.data.train.gbm 0 1
## 0 412 32
## 1 29 597
sum(diag(tab))/sum(tab)
## [1] 0.9429907
sensitivity(tab)
## [1] 0.9342404
specificity(tab)
## [1] 0.9491256
# Test-set confusion matrix and metrics (rows = actual, cols = predicted).
pred.y = predict(model.xgb.15, pred.test.gbm)
prediction <- as.numeric(pred.y > 0.5)
# Measure prediction accuracy on test data
(tab1<-table(insurance.data.test.gbm,prediction))
## prediction
## insurance.data.test.gbm 0 1
## 0 95 16
## 1 12 145
sensitivity(tab1)
## [1] 0.8878505
specificity(tab1)
## [1] 0.9006211
sum(diag(tab1))/sum(tab1)
## [1] 0.8955224
Accuracy on the training data set is 94% and on the test data 89% with 15 rounds, which is an improvement over the values obtained with 2 rounds.
#Support Vector Machines
# Fit an SVM on a fresh stratified 80/20 split and evaluate it.
library(e1071)
# Re-read the raw data so this section is independent of earlier transforms.
insurance.data.dup <- read.csv("~/Documents/GitHub/GitHub/Insurance-Claim-Prediction/data/insurance.csv")
insurance.data <- insurance.data.dup
# Fixed seed so the stratified split is reproducible.
set.seed(12345)
train.prop <- 0.80
# Stratify on the response so both splits keep the original class balance:
# split the row indices by class, then sample 80% within each class.
strats <- insurance.data$insuranceclaim
rr <- split(1:length(strats), strats)
idx <- sort(as.numeric(unlist(sapply(rr,
function(x) sample(x, length(x)*train.prop)))))
insurance.data.train <- insurance.data[idx, ]
insurance.data.test <- insurance.data[-idx, ]
# Class proportions in the training split (should mirror the full data).
table(insurance.data.train$insuranceclaim)/nrow(insurance.data.train)
##
## 0 1
## 0.4149533 0.5850467
# Train an SVM classifier
# NOTE(review): insuranceclaim is an integer 0/1 column, so svm() defaults
# to eps-regression (see "SVM-Type: eps-regression" in the output below)
# rather than classification. For a true SVM classifier, convert the
# response to a factor or pass type = "C-classification". The 0.5
# thresholding below works around this but should be fixed at the source.
svm_model <- svm(insuranceclaim ~ ., data = insurance.data.train, kernel = "linear", cost = 1)
svm_model
##
## Call:
## svm(formula = insuranceclaim ~ ., data = insurance.data.train, kernel = "linear",
## cost = 1)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: linear
## cost: 1
## gamma: 0.1428571
## epsilon: 0.1
##
##
## Number of Support Vectors: 981
# Make predictions on the test set
# Regression output is a numeric score; threshold at 0.5 for class labels.
predictions <- predict(svm_model, newdata = insurance.data.test, type= "response")
prediction <- as.numeric(predictions > 0.5)
conf_matrix_svm <- table(prediction, insurance.data.test$insuranceclaim)
# Display the confusion matrix
print(conf_matrix_svm)
##
## prediction 0 1
## 0 98 22
## 1 13 135
# Test-set accuracy, sensitivity, and specificity from the 2x2 table.
sum(diag(conf_matrix_svm))/sum(conf_matrix_svm)
## [1] 0.869403
sensitivity(conf_matrix_svm)
## [1] 0.8828829
specificity(conf_matrix_svm)
## [1] 0.8598726
# Make predictions on the train set
predictions <- predict(svm_model, newdata = insurance.data.train, type= "response")
prediction <- as.numeric(predictions > 0.5)
conf_matrix_svm <- table(prediction, insurance.data.train$insuranceclaim)
# Display the confusion matrix
print(conf_matrix_svm)
##
## prediction 0 1
## 0 370 69
## 1 74 557
# Train-set accuracy, sensitivity, and specificity.
sum(diag(conf_matrix_svm))/sum(conf_matrix_svm)
## [1] 0.8663551
sensitivity(conf_matrix_svm)
## [1] 0.8333333
specificity(conf_matrix_svm)
## [1] 0.8897764
# Visualize the fitted model against the training data.
plot(svm_model, insurance.data.train)
| Parameter | Value |
|---|---|
| SVM-Type | eps-regression |
| SVM-Kernel | linear |
| Cost | 1 |
| Gamma | 0.1428571 |
| Epsilon | 0.1 |
| Support Vectors |
|---|
| 981 |
| Predicted \ Actual | 0 | 1 |
|---|---|---|
| 0 | 370 | 69 |
| 1 | 74 | 557 |
| Predicted \ Actual | 0 | 1 |
|---|---|---|
| 0 | 98 | 22 |
| 1 | 13 | 135 |